Skip to content

Commit 105edc2

Browse files
lidavidm authored and pitrou committed
ARROW-12598: [C++][Dataset] Speed up CountRows for CSV
This does not implement a fast path for CSV. However, it does configure the CSV reader to not actually deserialize any data, resulting in a large gain. When scanning 85 million rows of the NYC Taxi dataset, scan time dropped from ~7 seconds to 2. This also sneaks in an implementation of the fast path for InMemoryFragment. Closes apache#10270 from lidavidm/arrow-12598 Authored-by: David Li <li.davidm96@gmail.com> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 8103972 commit 105edc2

16 files changed

Lines changed: 246 additions & 52 deletions

cpp/src/arrow/csv/reader.cc

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1027,6 +1027,70 @@ Future<std::shared_ptr<StreamingReader>> MakeStreamingReader(
10271027
return reader->Init();
10281028
}
10291029

1030+
/////////////////////////////////////////////////////////////////////////
1031+
// Row count implementation
1032+
1033+
// Counts the rows of a CSV input stream asynchronously.
//
// The counter builds on ReaderMixin (constructed with default ConvertOptions and
// count_rows=true) and only parses each block: DoCount() sums
// parser->num_rows() per block and never invokes a column conversion step.
// Instances are expected to be owned by a std::shared_ptr, since Count() calls
// shared_from_this().
class CSVRowCounter : public ReaderMixin,
1034+
public std::enable_shared_from_this<CSVRowCounter> {
1035+
public:
1036+
// \param io_context forwarded to ReaderMixin; its executor drives the background read
// \param cpu_executor executor that the read buffers are transferred onto (see Init)
// \param input the CSV byte stream, moved into ReaderMixin
// \param read_options forwarded to ReaderMixin (block_size is read in Init)
// \param parse_options forwarded to ReaderMixin (used to build the chunker in Init)
CSVRowCounter(io::IOContext io_context, Executor* cpu_executor,
1037+
std::shared_ptr<io::InputStream> input, const ReadOptions& read_options,
1038+
const ParseOptions& parse_options)
1039+
: ReaderMixin(io_context, std::move(input), read_options, parse_options,
1040+
ConvertOptions::Defaults(), /*count_rows=*/true),
1041+
cpu_executor_(cpu_executor),
1042+
row_count_(0) {}
1043+
1044+
// Run Init() and then DoCount(); the returned future resolves to the total row count.
// `self` is captured by value in the continuation so the counter stays alive
// until the continuation has run.
Future<int64_t> Count() {
1045+
auto self = shared_from_this();
1046+
return Init(self).Then([self]() { return self->DoCount(self); });
1047+
}
1048+
1049+
private:
// Set up the asynchronous block pipeline and store it in block_generator_.
//
// Fails with Status::Invalid("Empty CSV file") when the first buffer is null,
// and propagates any error from the iterator/generator setup or ProcessHeader().
1050+
Future<> Init(const std::shared_ptr<CSVRowCounter>& self) {
1051+
// Read the input in chunks of read_options_.block_size bytes.
ARROW_ASSIGN_OR_RAISE(auto istream_it,
1052+
io::MakeInputStreamIterator(input_, read_options_.block_size));
1053+
// TODO Consider exposing readahead as a read option (ARROW-12090)
1054+
// Pull from the stream iterator on the IO context's executor...
ARROW_ASSIGN_OR_RAISE(auto bg_it, MakeBackgroundGenerator(std::move(istream_it),
1055+
io_context_.executor()));
1056+
// ...and hand the resulting buffers over to the CPU executor.
auto transferred_it = MakeTransferredGenerator(bg_it, cpu_executor_);
1057+
auto buffer_generator = CSVBufferIterator::MakeAsync(std::move(transferred_it));
1058+
1059+
// Fetch the first buffer up front; it is passed through ProcessHeader() before
// the block reader is created.
return buffer_generator().Then([self, buffer_generator](
1060+
std::shared_ptr<Buffer> first_buffer) {
1061+
if (!first_buffer) {
1062+
return Status::Invalid("Empty CSV file");
1063+
}
1064+
// NOTE(review): ProcessHeader is defined outside this view; presumably it consumes
// skipped/header rows and leaves the remaining data in first_buffer - confirm.
RETURN_NOT_OK(self->ProcessHeader(first_buffer, &first_buffer));
1065+
self->block_generator_ = SerialBlockReader::MakeAsyncIterator(
1066+
buffer_generator, MakeChunker(self->parse_options_), std::move(first_buffer));
1067+
return Status::OK();
1068+
});
1069+
}
1070+
1071+
// Parse every block produced by block_generator_ and accumulate the row count.
// The returned future resolves to row_count_ once the generator is exhausted.
Future<int64_t> DoCount(const std::shared_ptr<CSVRowCounter>& self) {
1072+
// We must return a value instead of Status/Future<> to work with MakeMappedGenerator,
1073+
// and we must use a type with a valid end value to work with IterationEnd.
1074+
std::function<Result<util::optional<int64_t>>(const CSVBlock&)> count_cb =
1075+
[self](const CSVBlock& maybe_block) -> Result<util::optional<int64_t>> {
1076+
ARROW_ASSIGN_OR_RAISE(
1077+
auto parser,
1078+
self->Parse(maybe_block.partial, maybe_block.completion, maybe_block.buffer,
1079+
maybe_block.block_index, maybe_block.is_final));
1080+
// Report how many bytes the parser consumed back to the block.
RETURN_NOT_OK(maybe_block.consume_bytes(parser.parsed_bytes));
1081+
// The running total lives in row_count_; the per-block value returned below is
// discarded by DiscardAllFromAsyncGenerator.
self->row_count_ += parser.parser->num_rows();
1082+
return parser.parser->num_rows();
1083+
};
1084+
auto count_gen = MakeMappedGenerator(block_generator_, std::move(count_cb));
1085+
return DiscardAllFromAsyncGenerator(count_gen).Then(
1086+
[self]() { return self->row_count_; });
1087+
}
1088+
1089+
// Executor that read buffers are transferred onto (not owned; raw pointer).
Executor* cpu_executor_;
1090+
// Generator of CSV blocks; assigned in Init(), consumed in DoCount().
AsyncGenerator<CSVBlock> block_generator_;
1091+
// Running total of parsed rows, accumulated in DoCount().
int64_t row_count_;
1092+
};
1093+
10301094
} // namespace
10311095

10321096
/////////////////////////////////////////////////////////////////////////
@@ -1081,6 +1145,16 @@ Future<std::shared_ptr<StreamingReader>> StreamingReader::MakeAsync(
10811145
parse_options, convert_options);
10821146
}
10831147

1148+
// Count the rows of a CSV input stream asynchronously.
//
// Creates a shared CSVRowCounter over `input` and returns the future produced by
// its Count(); the counter keeps itself alive through the shared_ptr captured
// in its own continuations.
Future<int64_t> CountRowsAsync(io::IOContext io_context,
1149+
std::shared_ptr<io::InputStream> input,
1150+
internal::Executor* cpu_executor,
1151+
const ReadOptions& read_options,
1152+
const ParseOptions& parse_options) {
1153+
// Note the argument order: the constructor takes cpu_executor before input.
auto counter = std::make_shared<CSVRowCounter>(
1154+
io_context, cpu_executor, std::move(input), read_options, parse_options);
1155+
return counter->Count();
1156+
}
1157+
10841158
} // namespace csv
10851159

10861160
} // namespace arrow

cpp/src/arrow/csv/reader.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,5 +96,13 @@ class ARROW_EXPORT StreamingReader : public RecordBatchReader {
9696
const ConvertOptions& convert_options);
9797
};
9898

99+
/// \brief Count the logical rows of data in a CSV file (i.e. the
100+
/// number of rows you would get if you read the file into a table).
101+
///
/// \param io_context IO context whose executor performs the background reads
/// \param input the CSV byte stream to count
/// \param cpu_executor executor that the read buffers are transferred onto
/// \return a future resolving to the row count once the input is exhausted;
/// an input with no data resolves to Status::Invalid("Empty CSV file")
ARROW_EXPORT
102+
Future<int64_t> CountRowsAsync(io::IOContext io_context,
103+
std::shared_ptr<io::InputStream> input,
104+
internal::Executor* cpu_executor, const ReadOptions&,
105+
const ParseOptions&);
106+
99107
} // namespace csv
100108
} // namespace arrow

cpp/src/arrow/csv/reader_test.cc

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,5 +216,55 @@ TEST(StreamingReaderTests, NestedParallelism) {
216216
TestNestedParallelism(thread_pool, table_factory);
217217
}
218218

219+
// Checks CountRowsAsync against one sample CSV buffer under four ReadOptions
// configurations. Each scope builds a fresh BufferReader over the same buffer.
TEST(CountRowsAsync, Basics) {
220+
constexpr int NROWS = 4096;
221+
// NOTE(review): MakeSampleCsvBuffer is defined elsewhere; presumably it produces a
// header row plus NROWS data rows - confirm against the helper.
ASSERT_OK_AND_ASSIGN(auto table_buffer, MakeSampleCsvBuffer(NROWS));
222+
// Case 1: default options -> exactly NROWS rows.
{
223+
auto reader = std::make_shared<io::BufferReader>(table_buffer);
224+
auto read_options = ReadOptions::Defaults();
225+
auto parse_options = ParseOptions::Defaults();
226+
ASSERT_FINISHES_OK_AND_EQ(
227+
NROWS, CountRowsAsync(io::default_io_context(), reader,
228+
internal::GetCpuThreadPool(), read_options, parse_options));
229+
}
230+
// Case 2: skip_rows = 20 -> the count drops by 20.
{
231+
auto reader = std::make_shared<io::BufferReader>(table_buffer);
232+
auto read_options = ReadOptions::Defaults();
233+
read_options.skip_rows = 20;
234+
auto parse_options = ParseOptions::Defaults();
235+
ASSERT_FINISHES_OK_AND_EQ(NROWS - 20, CountRowsAsync(io::default_io_context(), reader,
236+
internal::GetCpuThreadPool(),
237+
read_options, parse_options));
238+
}
239+
// Case 3: autogenerate_column_names -> one extra row (presumably the header line is
// then counted as data - confirm).
{
240+
auto reader = std::make_shared<io::BufferReader>(table_buffer);
241+
auto read_options = ReadOptions::Defaults();
242+
read_options.autogenerate_column_names = true;
243+
auto parse_options = ParseOptions::Defaults();
244+
ASSERT_FINISHES_OK_AND_EQ(NROWS + 1, CountRowsAsync(io::default_io_context(), reader,
245+
internal::GetCpuThreadPool(),
246+
read_options, parse_options));
247+
}
248+
// Case 4: block_size = 1024 -> the count must still be NROWS (presumably this forces
// the input to span multiple blocks - confirm against the sample buffer size).
{
249+
auto reader = std::make_shared<io::BufferReader>(table_buffer);
250+
auto read_options = ReadOptions::Defaults();
251+
read_options.block_size = 1024;
252+
auto parse_options = ParseOptions::Defaults();
253+
ASSERT_FINISHES_OK_AND_EQ(
254+
NROWS, CountRowsAsync(io::default_io_context(), reader,
255+
internal::GetCpuThreadPool(), read_options, parse_options));
256+
}
257+
}
258+
259+
// Checks that counting a sample buffer generated with valid=false finishes with
// an Invalid status instead of a row count.
TEST(CountRowsAsync, Errors) {
260+
// NOTE(review): the exact malformation produced by valid=false is defined in
// MakeSampleCsvBuffer, outside this view.
ASSERT_OK_AND_ASSIGN(auto table_buffer, MakeSampleCsvBuffer(4096, /*valid=*/false));
261+
auto reader = std::make_shared<io::BufferReader>(table_buffer);
262+
auto read_options = ReadOptions::Defaults();
263+
auto parse_options = ParseOptions::Defaults();
264+
ASSERT_FINISHES_AND_RAISES(
265+
Invalid, CountRowsAsync(io::default_io_context(), reader,
266+
internal::GetCpuThreadPool(), read_options, parse_options));
267+
}
268+
219269
} // namespace csv
220270
} // namespace arrow

cpp/src/arrow/dataset/dataset.cc

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ Result<std::shared_ptr<Schema>> Fragment::ReadPhysicalSchema() {
5353
}
5454

5555
Future<util::optional<int64_t>> Fragment::CountRows(compute::Expression,
56-
std::shared_ptr<ScanOptions>) {
56+
const std::shared_ptr<ScanOptions>&) {
5757
return Future<util::optional<int64_t>>::MakeFinished(util::nullopt);
5858
}
5959

@@ -149,6 +149,18 @@ Result<RecordBatchGenerator> InMemoryFragment::ScanBatchesAsync(
149149
options->batch_size);
150150
}
151151

152+
// Fast-path row count for an in-memory fragment.
//
// Resolves to an empty optional when `predicate` references any field;
// otherwise resolves to the sum of num_rows() over record_batches_.
// `options` is not used by this implementation.
Future<util::optional<int64_t>> InMemoryFragment::CountRows(
153+
compute::Expression predicate, const std::shared_ptr<ScanOptions>& options) {
154+
// A predicate with field references cannot be answered from batch sizes alone
// (presumably it would need per-row evaluation), so decline the fast path.
if (ExpressionHasFieldRefs(predicate)) {
155+
return Future<util::optional<int64_t>>::MakeFinished(util::nullopt);
156+
}
157+
int64_t total = 0;
158+
for (const auto& batch : record_batches_) {
159+
total += batch->num_rows();
160+
}
161+
return Future<util::optional<int64_t>>::MakeFinished(total);
162+
}
163+
152164
Dataset::Dataset(std::shared_ptr<Schema> schema, compute::Expression partition_expression)
153165
: schema_(std::move(schema)),
154166
partition_expression_(std::move(partition_expression)) {}

cpp/src/arrow/dataset/dataset.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
7676
///
7777
/// If this is not possible, resolve with an empty optional. The fragment can perform
7878
/// I/O (e.g. to read metadata) before it deciding whether it can satisfy the request.
79-
virtual Future<util::optional<int64_t>> CountRows(compute::Expression predicate,
80-
std::shared_ptr<ScanOptions> options);
79+
virtual Future<util::optional<int64_t>> CountRows(
80+
compute::Expression predicate, const std::shared_ptr<ScanOptions>& options);
8181

8282
virtual std::string type_name() const = 0;
8383
virtual std::string ToString() const { return type_name(); }
@@ -133,6 +133,9 @@ class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
133133
Result<ScanTaskIterator> Scan(std::shared_ptr<ScanOptions> options) override;
134134
Result<RecordBatchGenerator> ScanBatchesAsync(
135135
const std::shared_ptr<ScanOptions>& options) override;
136+
Future<util::optional<int64_t>> CountRows(
137+
compute::Expression predicate,
138+
const std::shared_ptr<ScanOptions>& options) override;
136139

137140
std::string type_name() const override { return "in-memory"; }
138141

cpp/src/arrow/dataset/file_base.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ Result<std::shared_ptr<io::InputStream>> FileSource::OpenCompressed(
8686

8787
Future<util::optional<int64_t>> FileFormat::CountRows(
8888
const std::shared_ptr<FileFragment>&, compute::Expression,
89-
std::shared_ptr<ScanOptions>) {
89+
const std::shared_ptr<ScanOptions>&) {
9090
return Future<util::optional<int64_t>>::MakeFinished(util::nullopt);
9191
}
9292

@@ -176,14 +176,14 @@ Result<RecordBatchGenerator> FileFragment::ScanBatchesAsync(
176176
}
177177

178178
Future<util::optional<int64_t>> FileFragment::CountRows(
179-
compute::Expression predicate, std::shared_ptr<ScanOptions> options) {
179+
compute::Expression predicate, const std::shared_ptr<ScanOptions>& options) {
180180
ARROW_ASSIGN_OR_RAISE(predicate, compute::SimplifyWithGuarantee(std::move(predicate),
181181
partition_expression_));
182182
if (!predicate.IsSatisfiable()) {
183183
return Future<util::optional<int64_t>>::MakeFinished(0);
184184
}
185185
auto self = internal::checked_pointer_cast<FileFragment>(shared_from_this());
186-
return format()->CountRows(self, std::move(predicate), std::move(options));
186+
return format()->CountRows(self, std::move(predicate), options);
187187
}
188188

189189
struct FileSystemDataset::FragmentSubtrees {

cpp/src/arrow/dataset/file_base.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileForma
157157
const std::shared_ptr<FileFragment>& file) const;
158158
virtual Future<util::optional<int64_t>> CountRows(
159159
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
160-
std::shared_ptr<ScanOptions> options);
160+
const std::shared_ptr<ScanOptions>& options);
161161

162162
/// \brief Open a fragment
163163
virtual Result<std::shared_ptr<FileFragment>> MakeFragment(
@@ -188,7 +188,8 @@ class ARROW_DS_EXPORT FileFragment : public Fragment {
188188
Result<RecordBatchGenerator> ScanBatchesAsync(
189189
const std::shared_ptr<ScanOptions>& options) override;
190190
Future<util::optional<int64_t>> CountRows(
191-
compute::Expression predicate, std::shared_ptr<ScanOptions> options) override;
191+
compute::Expression predicate,
192+
const std::shared_ptr<ScanOptions>& options) override;
192193

193194
std::string type_name() const override { return format_->type_name(); }
194195
std::string ToString() const override { return source_.path(); };

cpp/src/arrow/dataset/file_csv.cc

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -48,18 +48,29 @@ using internal::SerialExecutor;
4848
using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
4949

5050
Result<std::unordered_set<std::string>> GetColumnNames(
51-
const csv::ParseOptions& parse_options, util::string_view first_block,
52-
MemoryPool* pool) {
51+
const csv::ReadOptions& read_options, const csv::ParseOptions& parse_options,
52+
util::string_view first_block, MemoryPool* pool) {
53+
if (!read_options.column_names.empty()) {
54+
std::unordered_set<std::string> column_names;
55+
for (const auto& s : read_options.column_names) {
56+
if (!column_names.emplace(s).second) {
57+
return Status::Invalid("CSV file contained multiple columns named ", s);
58+
}
59+
}
60+
return column_names;
61+
}
62+
5363
uint32_t parsed_size = 0;
64+
int32_t max_num_rows = read_options.skip_rows + 1;
5465
csv::BlockParser parser(pool, parse_options, /*num_cols=*/-1, /*first_row=*/1,
55-
/*max_num_rows=*/1);
66+
max_num_rows);
5667

5768
RETURN_NOT_OK(parser.Parse(util::string_view{first_block}, &parsed_size));
5869

59-
if (parser.num_rows() != 1) {
60-
return Status::Invalid(
61-
"Could not read first row from CSV file, either "
62-
"file is truncated or header is larger than block size");
70+
if (parser.num_rows() != max_num_rows) {
71+
return Status::Invalid("Could not read first ", max_num_rows,
72+
" rows from CSV file, either file is truncated or"
73+
" header is larger than block size");
6374
}
6475

6576
if (parser.num_cols() == 0) {
@@ -83,15 +94,15 @@ Result<std::unordered_set<std::string>> GetColumnNames(
8394
static inline Result<csv::ConvertOptions> GetConvertOptions(
8495
const CsvFileFormat& format, const ScanOptions* scan_options,
8596
const util::string_view first_block) {
86-
ARROW_ASSIGN_OR_RAISE(
87-
auto column_names,
88-
GetColumnNames(format.parse_options, first_block,
89-
scan_options ? scan_options->pool : default_memory_pool()));
90-
9197
ARROW_ASSIGN_OR_RAISE(
9298
auto csv_scan_options,
9399
GetFragmentScanOptions<CsvFragmentScanOptions>(
94100
kCsvTypeName, scan_options, format.default_fragment_scan_options));
101+
ARROW_ASSIGN_OR_RAISE(
102+
auto column_names,
103+
GetColumnNames(csv_scan_options->read_options, format.parse_options, first_block,
104+
scan_options ? scan_options->pool : default_memory_pool()));
105+
95106
auto convert_options = csv_scan_options->convert_options;
96107

97108
if (!scan_options) return convert_options;
@@ -257,5 +268,20 @@ Result<RecordBatchGenerator> CsvFileFormat::ScanBatchesAsync(
257268
return GeneratorFromReader(std::move(reader_fut));
258269
}
259270

271+
// Count the rows of a CSV file fragment via csv::CountRowsAsync.
//
// Resolves to an empty optional when `predicate` references any field.
// Otherwise opens the fragment's source (through OpenCompressed()), resolves the
// read options, and wraps the resulting count in an optional. Errors from opening
// the source or resolving the read options are returned through the future.
Future<util::optional<int64_t>> CsvFileFormat::CountRows(
272+
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
273+
const std::shared_ptr<ScanOptions>& options) {
274+
// Decline the fast path for predicates that reference fields.
if (ExpressionHasFieldRefs(predicate)) {
275+
return Future<util::optional<int64_t>>::MakeFinished(util::nullopt);
276+
}
277+
auto self = internal::checked_pointer_cast<CsvFileFormat>(shared_from_this());
278+
ARROW_ASSIGN_OR_RAISE(auto input, file->source().OpenCompressed());
279+
ARROW_ASSIGN_OR_RAISE(auto read_options, GetReadOptions(*self, options));
280+
// NOTE(review): `options` is dereferenced unconditionally here; this assumes callers
// never pass a null ScanOptions pointer - confirm.
// Counting always runs on the global CPU thread pool.
return csv::CountRowsAsync(options->io_context, std::move(input),
281+
internal::GetCpuThreadPool(), read_options,
282+
self->parse_options)
283+
.Then([](int64_t count) { return util::make_optional<int64_t>(count); });
284+
}
285+
260286
} // namespace dataset
261287
} // namespace arrow

cpp/src/arrow/dataset/file_csv.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
6161
const std::shared_ptr<ScanOptions>& scan_options,
6262
const std::shared_ptr<FileFragment>& file) const override;
6363

64+
Future<util::optional<int64_t>> CountRows(
65+
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
66+
const std::shared_ptr<ScanOptions>& options) override;
67+
6468
Result<std::shared_ptr<FileWriter>> MakeWriter(
6569
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
6670
std::shared_ptr<FileWriteOptions> options) const override {

0 commit comments

Comments
 (0)