@@ -1027,6 +1027,70 @@ Future<std::shared_ptr<StreamingReader>> MakeStreamingReader(
10271027 return reader->Init ();
10281028}
10291029
1030+ // ///////////////////////////////////////////////////////////////////////
1031+ // Row count implementation
1032+
1033+ class CSVRowCounter : public ReaderMixin ,
1034+ public std::enable_shared_from_this<CSVRowCounter> {
1035+ public:
1036+ CSVRowCounter (io::IOContext io_context, Executor* cpu_executor,
1037+ std::shared_ptr<io::InputStream> input, const ReadOptions& read_options,
1038+ const ParseOptions& parse_options)
1039+ : ReaderMixin(io_context, std::move(input), read_options, parse_options,
1040+ ConvertOptions::Defaults (), /* count_rows=*/ true),
1041+ cpu_executor_(cpu_executor),
1042+ row_count_(0 ) {}
1043+
1044+ Future<int64_t > Count () {
1045+ auto self = shared_from_this ();
1046+ return Init (self).Then ([self]() { return self->DoCount (self); });
1047+ }
1048+
1049+ private:
1050+ Future<> Init (const std::shared_ptr<CSVRowCounter>& self) {
1051+ ARROW_ASSIGN_OR_RAISE (auto istream_it,
1052+ io::MakeInputStreamIterator (input_, read_options_.block_size ));
1053+ // TODO Consider exposing readahead as a read option (ARROW-12090)
1054+ ARROW_ASSIGN_OR_RAISE (auto bg_it, MakeBackgroundGenerator (std::move (istream_it),
1055+ io_context_.executor ()));
1056+ auto transferred_it = MakeTransferredGenerator (bg_it, cpu_executor_);
1057+ auto buffer_generator = CSVBufferIterator::MakeAsync (std::move (transferred_it));
1058+
1059+ return buffer_generator ().Then ([self, buffer_generator](
1060+ std::shared_ptr<Buffer> first_buffer) {
1061+ if (!first_buffer) {
1062+ return Status::Invalid (" Empty CSV file" );
1063+ }
1064+ RETURN_NOT_OK (self->ProcessHeader (first_buffer, &first_buffer));
1065+ self->block_generator_ = SerialBlockReader::MakeAsyncIterator (
1066+ buffer_generator, MakeChunker (self->parse_options_ ), std::move (first_buffer));
1067+ return Status::OK ();
1068+ });
1069+ }
1070+
1071+ Future<int64_t > DoCount (const std::shared_ptr<CSVRowCounter>& self) {
1072+ // We must return a value instead of Status/Future<> to work with MakeMappedGenerator,
1073+ // and we must use a type with a valid end value to work with IterationEnd.
1074+ std::function<Result<util::optional<int64_t >>(const CSVBlock&)> count_cb =
1075+ [self](const CSVBlock& maybe_block) -> Result<util::optional<int64_t >> {
1076+ ARROW_ASSIGN_OR_RAISE (
1077+ auto parser,
1078+ self->Parse (maybe_block.partial , maybe_block.completion , maybe_block.buffer ,
1079+ maybe_block.block_index , maybe_block.is_final ));
1080+ RETURN_NOT_OK (maybe_block.consume_bytes (parser.parsed_bytes ));
1081+ self->row_count_ += parser.parser ->num_rows ();
1082+ return parser.parser ->num_rows ();
1083+ };
1084+ auto count_gen = MakeMappedGenerator (block_generator_, std::move (count_cb));
1085+ return DiscardAllFromAsyncGenerator (count_gen).Then (
1086+ [self]() { return self->row_count_ ; });
1087+ }
1088+
1089+ Executor* cpu_executor_;
1090+ AsyncGenerator<CSVBlock> block_generator_;
1091+ int64_t row_count_;
1092+ };
1093+
10301094} // namespace
10311095
10321096// ///////////////////////////////////////////////////////////////////////
@@ -1081,6 +1145,16 @@ Future<std::shared_ptr<StreamingReader>> StreamingReader::MakeAsync(
10811145 parse_options, convert_options);
10821146}
10831147
1148+ Future<int64_t > CountRowsAsync (io::IOContext io_context,
1149+ std::shared_ptr<io::InputStream> input,
1150+ internal::Executor* cpu_executor,
1151+ const ReadOptions& read_options,
1152+ const ParseOptions& parse_options) {
1153+ auto counter = std::make_shared<CSVRowCounter>(
1154+ io_context, cpu_executor, std::move (input), read_options, parse_options);
1155+ return counter->Count ();
1156+ }
1157+
10841158} // namespace csv
10851159
10861160} // namespace arrow
0 commit comments