Skip to content

Commit 5ebab5a

Browse files
pitrouwesm
authored and committed
ARROW-25: [C++] Implement CSV reader
This includes:
- a CSV table reader written in C++
- a Python wrapper around the CSV table reader
- simple type inference for CSV values (null -> int64 -> float64 -> binary)
- generic null parsing using Pandas defaults as a baseline ("NA", "N/A", "NaN"...)
- some simple syntax parameters for CSV parsing

Not included:
- conversion and typing options
- performance tuning

Author: Antoine Pitrou <antoine@python.org>

Closes apache#2576 from pitrou/ARROW-25-csv-reader and squashes the following commits:

4ae93b2 <Antoine Pitrou> ARROW-25: Implement CSV reader
1 parent b83db61 commit 5ebab5a

33 files changed

Lines changed: 4553 additions & 7 deletions

cpp/src/arrow/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,13 @@ set(ARROW_SRCS
3030
type.cc
3131
visitor.cc
3232

33+
csv/converter.cc
34+
csv/chunker.cc
35+
csv/column-builder.cc
36+
csv/options.cc
37+
csv/parser.cc
38+
csv/reader.cc
39+
3340
io/buffered.cc
3441
io/file.cc
3542
io/interfaces.cc
@@ -43,6 +50,7 @@ set(ARROW_SRCS
4350
util/hash.cc
4451
util/io-util.cc
4552
util/key_value_metadata.cc
53+
util/task-group.cc
4654
util/thread-pool.cc
4755
util/logging.cc
4856
)
@@ -217,5 +225,6 @@ ADD_ARROW_TEST(tensor-test)
217225
ADD_ARROW_BENCHMARK(builder-benchmark)
218226
ADD_ARROW_BENCHMARK(column-benchmark)
219227

228+
add_subdirectory(csv)
220229
add_subdirectory(io)
221230
add_subdirectory(util)

cpp/src/arrow/csv/CMakeLists.txt

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# Unit tests, one per CSV component
# (ADD_ARROW_TEST is a project macro defined outside this file)
ADD_ARROW_TEST(csv-chunker-test)
19+
ADD_ARROW_TEST(csv-column-builder-test)
20+
ADD_ARROW_TEST(csv-converter-test)
21+
ADD_ARROW_TEST(csv-parser-test)
22+
23+
# Benchmarks for the converter and the parser
# (ADD_ARROW_BENCHMARK is a project macro defined outside this file)
ADD_ARROW_BENCHMARK(csv-converter-benchmark)
24+
ADD_ARROW_BENCHMARK(csv-parser-benchmark)
25+
26+
# Headers: top level
# Collect every *.h file found recursively from this directory
27+
file(GLOB_RECURSE ARROW_CSV_HEADERS "*.h")
28+
29+
# Install the collected headers under <includedir>/arrow/csv
install(FILES
30+
${ARROW_CSV_HEADERS}
31+
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/csv")

cpp/src/arrow/csv/api.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#ifndef ARROW_CSV_API_H
19+
#define ARROW_CSV_API_H
20+
21+
#include "arrow/csv/options.h"
22+
#include "arrow/csv/reader.h"
23+
24+
#endif // ARROW_CSV_API_H

cpp/src/arrow/csv/chunker.cc

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "arrow/csv/chunker.h"
19+
#include "arrow/status.h"
20+
#include "arrow/util/logging.h"
21+
22+
#include <sstream>
23+
#include <string>
24+
25+
namespace arrow {
26+
namespace csv {
27+
28+
// Build a chunker for the given CSV syntax options, chunking at most
// max_num_rows rows per Process() call.
//
// num_rows_ is explicitly zero-initialized so that num_rows() returns a
// well-defined value even if it is queried before the first Process() call.
// Initializers are listed in member declaration order.
Chunker::Chunker(ParseOptions options, int32_t max_num_rows)
    : options_(options), num_rows_(0), max_num_rows_(max_num_rows) {}
30+
31+
// NOTE: csvmonkey (https://github.com/dw/csvmonkey) has optimization ideas
32+
33+
// Scan a single CSV line starting at `data` and bounded by `data_end`.
//
// Returns a pointer just past the line terminator (LF, CR, or CR LF when the
// LF is available in the block), or nullptr if `data_end` is reached before
// a line terminator is found outside of a quoted section.
//
// The template parameters must mirror options_.quoting and options_.escaping
// (checked with DCHECK_EQ below); they let the compiler drop the
// corresponding tests when the feature is disabled.
template <bool quoting, bool escaping>
34+
inline const char* Chunker::ReadLine(const char* data, const char* data_end) {
35+
DCHECK_EQ(quoting, options_.quoting);
36+
DCHECK_EQ(escaping, options_.escaping);
37+
38+
// The parsing state machine
// (each label below is a state; transitions are expressed with goto)
39+
char c;
40+
41+
FieldStart:
42+
// At the start of a field
43+
// Quoting is only recognized at start of field
44+
if (quoting && ARROW_PREDICT_TRUE(data != data_end) && *data == options_.quote_char) {
45+
data++;
46+
goto InQuotedField;
47+
} else {
48+
goto InField;
49+
}
50+
51+
InField:
52+
// Inside a non-quoted part of a field
53+
if (ARROW_PREDICT_FALSE(data == data_end)) {
54+
goto AbortLine;
55+
}
56+
c = *data++;
57+
if (escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
58+
if (ARROW_PREDICT_FALSE(data == data_end)) {
59+
goto AbortLine;
60+
}
61+
// Skip the escaped character, whatever it is
data++;
62+
goto InField;
63+
}
64+
if (ARROW_PREDICT_FALSE(c == '\r')) {
65+
// Swallow the LF of a CR LF pair if it is present in this block
if (ARROW_PREDICT_TRUE(data != data_end) && *data == '\n') {
66+
data++;
67+
}
68+
goto LineEnd;
69+
}
70+
if (ARROW_PREDICT_FALSE(c == '\n')) {
71+
goto LineEnd;
72+
}
73+
if (ARROW_PREDICT_FALSE(c == options_.delimiter)) {
74+
goto FieldEnd;
75+
}
76+
goto InField;
77+
78+
InQuotedField:
79+
// Inside a quoted part of a field
// (newlines and delimiters are not special here; only the escape and
// quote characters are examined)
80+
if (ARROW_PREDICT_FALSE(data == data_end)) {
81+
goto AbortLine;
82+
}
83+
c = *data++;
84+
if (escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
85+
if (data == data_end) {
86+
goto AbortLine;
87+
}
88+
// Skip the escaped character, whatever it is
data++;
89+
goto InQuotedField;
90+
}
91+
if (ARROW_PREDICT_FALSE(c == options_.quote_char)) {
92+
if (options_.double_quote && data != data_end && *data == options_.quote_char) {
93+
// Double-quoting
94+
data++;
95+
} else {
96+
// End of single-quoting
// (the remainder of the field is scanned as non-quoted data)
97+
goto InField;
98+
}
99+
}
100+
goto InQuotedField;
101+
102+
FieldEnd:
103+
// At the end of a field
104+
goto FieldStart;
105+
106+
LineEnd:
107+
// At the end of line, possibly in the middle of the newline separator
// (that is the case when a CR is the last byte of the block: a LF starting
// the next block cannot be seen from here).
// The lookahead below is disabled: InField already consumes the LF that
// follows a CR whenever it is available.
108+
// if (ARROW_PREDICT_TRUE(data < data_end) && data[-1] == '\r' && *data == '\n') {
109+
// data++;
110+
// }
111+
return data;
112+
113+
AbortLine:
114+
// Truncated line at end of block
115+
return nullptr;
116+
}
117+
118+
template <bool quoting, bool escaping>
119+
Status Chunker::ProcessSpecialized(const char* start, uint32_t size, uint32_t* out_size) {
120+
DCHECK_EQ(quoting, options_.quoting);
121+
DCHECK_EQ(escaping, options_.escaping);
122+
123+
num_rows_ = 0;
124+
const char* data = start;
125+
const char* data_end = start + size;
126+
127+
while (data < data_end && num_rows_ < max_num_rows_) {
128+
const char* line_end = ReadLine<quoting, escaping>(data, data_end);
129+
if (line_end == nullptr) {
130+
// Cannot read any further
131+
break;
132+
}
133+
data = line_end;
134+
++num_rows_;
135+
}
136+
*out_size = static_cast<uint32_t>(data - start);
137+
return Status::OK();
138+
}
139+
140+
// Dispatch to the ProcessSpecialized() instantiation matching the
// quoting / escaping flags of the parse options.
Status Chunker::Process(const char* start, uint32_t size, uint32_t* out_size) {
  const bool with_quoting = options_.quoting;
  const bool with_escaping = options_.escaping;

  if (with_quoting && with_escaping) {
    return ProcessSpecialized<true, true>(start, size, out_size);
  }
  if (with_quoting) {
    return ProcessSpecialized<true, false>(start, size, out_size);
  }
  if (with_escaping) {
    return ProcessSpecialized<false, true>(start, size, out_size);
  }
  return ProcessSpecialized<false, false>(start, size, out_size);
}
155+
156+
} // namespace csv
157+
} // namespace arrow

cpp/src/arrow/csv/chunker.h

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#ifndef ARROW_CSV_CHUNKER_H
19+
#define ARROW_CSV_CHUNKER_H
20+
21+
#include <cstdint>
22+
#include <vector>
23+
24+
#include "arrow/csv/options.h"
25+
#include "arrow/status.h"
26+
#include "arrow/util/visibility.h"
27+
28+
namespace arrow {
29+
namespace csv {
30+
31+
// Default value of the Chunker constructor's max_num_rows argument
constexpr int32_t kMaxChunkerNumRows = 100000;
32+
33+
/// \class Chunker
34+
/// \brief A reusable block-based chunker for CSV data
35+
///
36+
/// The chunker takes a block of CSV data and finds a suitable place
37+
/// to cut it up without splitting a row.
38+
/// If the block is truncated (i.e. not all data can be chunked), it is up
39+
/// to the caller to arrange the next block to start with the trailing data.
40+
///
41+
/// Note: if the previous block ends with CR (0x0d) and a new block starts
42+
/// with LF (0x0a), the chunker will consider the leading newline as an empty line.
43+
class ARROW_EXPORT Chunker {
44+
public:
45+
/// \brief Create a chunker
///
/// \param options the CSV syntax options used to find row boundaries
/// \param max_num_rows the maximum number of rows chunked per Process() call
explicit Chunker(ParseOptions options, int32_t max_num_rows = kMaxChunkerNumRows);
46+
47+
/// \brief Carve up a chunk in a block of data
48+
///
49+
/// Process a block of CSV data, reading up to max_num_rows rows.
50+
/// The number of bytes in the chunk is returned in out_size.
/// A trailing line that has no terminator within the block is not
/// included in the chunk.
///
/// \param[in] data pointer to the start of the block
/// \param[in] size size of the block, in bytes
/// \param[out] out_size number of bytes spanned by the chunked rows
51+
Status Process(const char* data, uint32_t size, uint32_t* out_size);
52+
53+
/// \brief Number of rows chunked by the most recent Process() call
int32_t num_rows() const { return num_rows_; }
54+
55+
protected:
56+
ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker);
57+
58+
// Like Process(), but specialized for some parsing options
// (the template arguments must match options_.quoting and options_.escaping)
59+
template <bool quoting, bool escaping>
60+
Status ProcessSpecialized(const char* data, uint32_t size, uint32_t* out_size);
61+
62+
// Detect a single line from the data pointer. Return the line end,
63+
// or nullptr if the remaining line is truncated.
64+
template <bool quoting, bool escaping>
65+
inline const char* ReadLine(const char* data, const char* data_end);
66+
67+
// The CSV syntax options given at construction
ParseOptions options_;
68+
// The number of rows chunked from the block
69+
int32_t num_rows_;
70+
// The maximum number of rows to chunk from this block
71+
int32_t max_num_rows_;
72+
};
73+
74+
} // namespace csv
75+
} // namespace arrow
76+
77+
#endif // ARROW_CSV_CHUNKER_H

0 commit comments

Comments
 (0)