Skip to content

Commit 3752e34

Browse files
wesmbkietz
authored andcommitted
ARROW-8727: [C++] Don't require stack allocation of any object to use StringConverter, hide behind ParseValue function
Using non-stateful functions or functors is more natural for template-based code generation of e.g. kernels. Closes apache#7120 from wesm/string-converter-static Authored-by: Wes McKinney <wesm+git@apache.org> Signed-off-by: Benjamin Kietzman <bengilgit@gmail.com>
1 parent 5bc05e2 commit 3752e34

15 files changed

Lines changed: 337 additions & 367 deletions

File tree

cpp/src/arrow/c/bridge.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -652,9 +652,8 @@ class FormatStringParser {
652652
template <typename IntType = int32_t>
653653
Result<IntType> ParseInt(util::string_view v) {
654654
using ArrowIntType = typename CTypeTraits<IntType>::ArrowType;
655-
internal::StringConverter<ArrowIntType> converter;
656655
IntType value;
657-
if (!converter(v.data(), v.size(), &value)) {
656+
if (!internal::ParseValue<ArrowIntType>(v.data(), v.size(), &value)) {
658657
return Invalid();
659658
}
660659
return value;

cpp/src/arrow/compute/kernels/cast.cc

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,15 +1073,13 @@ struct CastFunctor<
10731073

10741074
typename TypeTraits<I>::ArrayType input_array(input.Copy());
10751075
auto out_data = output->GetMutableValues<out_type>(1);
1076-
internal::StringConverter<O> converter;
1077-
10781076
for (int64_t i = 0; i < input.length; ++i, ++out_data) {
10791077
if (input_array.IsNull(i)) {
10801078
continue;
10811079
}
10821080

10831081
auto str = input_array.GetView(i);
1084-
if (!converter(str.data(), str.length(), out_data)) {
1082+
if (!internal::ParseValue<O>(str.data(), str.length(), out_data)) {
10851083
ctx->SetStatus(Status::Invalid("Failed to cast String '", str, "' into ",
10861084
output->type->ToString()));
10871085
return;
@@ -1100,7 +1098,6 @@ struct CastFunctor<BooleanType, I, enable_if_t<is_string_like_type<I>::value>> {
11001098
typename TypeTraits<I>::ArrayType input_array(input.Copy());
11011099
internal::FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(),
11021100
output->offset, input.length);
1103-
internal::StringConverter<BooleanType> converter;
11041101

11051102
for (int64_t i = 0; i < input.length; ++i) {
11061103
if (input_array.IsNull(i)) {
@@ -1110,7 +1107,7 @@ struct CastFunctor<BooleanType, I, enable_if_t<is_string_like_type<I>::value>> {
11101107

11111108
bool value;
11121109
auto str = input_array.GetView(i);
1113-
if (!converter(str.data(), str.length(), &value)) {
1110+
if (!internal::ParseValue<BooleanType>(str.data(), str.length(), &value)) {
11141111
ctx->SetStatus(Status::Invalid("Failed to cast String '",
11151112
input_array.GetString(i), "' into ",
11161113
output->type->ToString()));
@@ -1139,15 +1136,15 @@ struct CastFunctor<TimestampType, I, enable_if_t<is_string_like_type<I>::value>>
11391136

11401137
typename TypeTraits<I>::ArrayType input_array(input.Copy());
11411138
auto out_data = output->GetMutableValues<out_type>(1);
1142-
internal::StringConverter<TimestampType> converter(output->type);
1139+
1140+
const TimeUnit::type unit = checked_cast<const TimestampType&>(*output->type).unit();
11431141

11441142
for (int64_t i = 0; i < input.length; ++i, ++out_data) {
11451143
if (input_array.IsNull(i)) {
11461144
continue;
11471145
}
1148-
11491146
const auto str = input_array.GetView(i);
1150-
if (!converter(str.data(), str.length(), out_data)) {
1147+
if (!internal::ParseTimestampISO8601(str.data(), str.length(), unit, out_data)) {
11511148
ctx->SetStatus(Status::Invalid("Failed to cast String '", str, "' into ",
11521149
output->type->ToString()));
11531150
return;

cpp/src/arrow/csv/converter.cc

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ namespace arrow {
4040
namespace csv {
4141

4242
using internal::checked_cast;
43-
using internal::StringConverter;
4443
using internal::Trie;
4544
using internal::TrieBuilder;
4645

@@ -349,10 +348,9 @@ class NumericConverter : public ConcreteConverter {
349348
Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
350349
int32_t col_index) override {
351350
using BuilderType = typename TypeTraits<T>::BuilderType;
352-
using value_type = typename StringConverter<T>::value_type;
351+
using value_type = typename T::c_type;
353352

354353
BuilderType builder(type_, pool_);
355-
StringConverter<T> converter;
356354

357355
auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
358356
// XXX should quoted values be allowed at all?
@@ -364,8 +362,8 @@ class NumericConverter : public ConcreteConverter {
364362
if (!std::is_same<BooleanType, T>::value) {
365363
TrimWhiteSpace(&data, &size);
366364
}
367-
if (ARROW_PREDICT_FALSE(
368-
!converter(reinterpret_cast<const char*>(data), size, &value))) {
365+
if (ARROW_PREDICT_FALSE(!internal::ParseValue<T>(
366+
reinterpret_cast<const char*>(data), size, &value))) {
369367
return GenericConversionError(type_, data, size);
370368
}
371369
builder.UnsafeAppend(value);

cpp/src/arrow/filesystem/hdfs.cc

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131

3232
namespace arrow {
3333

34+
using internal::ParseValue;
3435
using internal::Uri;
3536

3637
namespace fs {
@@ -312,9 +313,8 @@ Result<HdfsOptions> HdfsOptions::FromUri(const Uri& uri) {
312313
auto it = options_map.find("replication");
313314
if (it != options_map.end()) {
314315
const auto& v = it->second;
315-
::arrow::internal::StringConverter<Int16Type> converter;
316316
int16_t replication;
317-
if (!converter(v.data(), v.size(), &replication)) {
317+
if (!ParseValue<Int16Type>(v.data(), v.size(), &replication)) {
318318
return Status::Invalid("Invalid value for option 'replication': '", v, "'");
319319
}
320320
options.ConfigureReplication(replication);
@@ -324,9 +324,8 @@ Result<HdfsOptions> HdfsOptions::FromUri(const Uri& uri) {
324324
it = options_map.find("buffer_size");
325325
if (it != options_map.end()) {
326326
const auto& v = it->second;
327-
::arrow::internal::StringConverter<Int32Type> converter;
328327
int32_t buffer_size;
329-
if (!converter(v.data(), v.size(), &buffer_size)) {
328+
if (!ParseValue<Int32Type>(v.data(), v.size(), &buffer_size)) {
330329
return Status::Invalid("Invalid value for option 'buffer_size': '", v, "'");
331330
}
332331
options.ConfigureBufferSize(buffer_size);
@@ -336,9 +335,8 @@ Result<HdfsOptions> HdfsOptions::FromUri(const Uri& uri) {
336335
it = options_map.find("default_block_size");
337336
if (it != options_map.end()) {
338337
const auto& v = it->second;
339-
::arrow::internal::StringConverter<Int64Type> converter;
340338
int64_t default_block_size;
341-
if (!converter(v.data(), v.size(), &default_block_size)) {
339+
if (!ParseValue<Int64Type>(v.data(), v.size(), &default_block_size)) {
342340
return Status::Invalid("Invalid value for option 'default_block_size': '", v, "'");
343341
}
344342
options.ConfigureBlockSize(default_block_size);

cpp/src/arrow/ipc/json_internal.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ constexpr char kYearMonth[] = "YEAR_MONTH";
5858
class MemoryPool;
5959

6060
using internal::checked_cast;
61+
using internal::ParseValue;
6162

6263
namespace ipc {
6364
namespace internal {
@@ -1351,6 +1352,7 @@ class ArrayReader {
13511352
template <typename T>
13521353
Status GetIntArray(const RjArray& json_array, const int32_t length,
13531354
std::shared_ptr<Buffer>* out) {
1355+
using ArrowType = typename CTypeTraits<T>::ArrowType;
13541356
ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(length * sizeof(T), pool_));
13551357

13561358
T* values = reinterpret_cast<T*>(buffer->mutable_data());
@@ -1367,11 +1369,11 @@ class ArrayReader {
13671369
} else {
13681370
// Read 64-bit integers as strings, as JSON numbers cannot represent
13691371
// them exactly.
1370-
::arrow::internal::StringConverter<typename CTypeTraits<T>::ArrowType> converter;
1372+
13711373
for (int i = 0; i < length; ++i) {
13721374
const rj::Value& val = json_array[i];
13731375
DCHECK(val.IsString());
1374-
if (!converter(val.GetString(), val.GetStringLength(), &values[i])) {
1376+
if (!ParseValue<ArrowType>(val.GetString(), val.GetStringLength(), &values[i])) {
13751377
return Status::Invalid("Failed to parse integer: '",
13761378
std::string(val.GetString(), val.GetStringLength()),
13771379
"'");

cpp/src/arrow/ipc/json_simple.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@
3939
namespace rj = arrow::rapidjson;
4040

4141
namespace arrow {
42+
43+
using internal::ParseValue;
44+
4245
namespace ipc {
4346
namespace internal {
4447
namespace json {
@@ -319,7 +322,7 @@ class DecimalConverter final : public ConcreteConverter<DecimalConverter> {
319322
class TimestampConverter final : public ConcreteConverter<TimestampConverter> {
320323
public:
321324
explicit TimestampConverter(const std::shared_ptr<DataType>& type)
322-
: from_string_(type) {
325+
: parse_ctx_{checked_cast<const TimestampType&>(*type).unit()} {
323326
this->type_ = type;
324327
builder_ = std::make_shared<TimestampBuilder>(type, default_memory_pool());
325328
}
@@ -335,7 +338,7 @@ class TimestampConverter final : public ConcreteConverter<TimestampConverter> {
335338
RETURN_NOT_OK(ConvertNumber<Int64Type>(json_obj, *this->type_, &value));
336339
} else if (json_obj.IsString()) {
337340
auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength());
338-
if (!from_string_(view.data(), view.size(), &value)) {
341+
if (!ParseValue<TimestampType>(view.data(), view.size(), &value, &parse_ctx_)) {
339342
return Status::Invalid("couldn't parse timestamp from ", view);
340343
}
341344
} else {
@@ -347,7 +350,7 @@ class TimestampConverter final : public ConcreteConverter<TimestampConverter> {
347350
std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
348351

349352
private:
350-
::arrow::internal::StringConverter<TimestampType> from_string_;
353+
::arrow::internal::ParseTimestampContext parse_ctx_;
351354
std::shared_ptr<TimestampBuilder> builder_;
352355
};
353356

cpp/src/arrow/json/converter.cc

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,38 @@ class BooleanConverter : public PrimitiveConverter {
103103
};
104104

105105
template <typename T>
106-
class NumericConverter : public PrimitiveConverter {
106+
struct ParserAdapter {
107+
using value_type = typename T::c_type;
108+
109+
void InitParser(const DataType& type) {}
110+
111+
bool ConvertOne(const char* s, size_t length, value_type* out) {
112+
return internal::ParseValue<T>(s, length, out);
113+
}
114+
};
115+
116+
template <>
117+
struct ParserAdapter<TimestampType> {
118+
void InitParser(const DataType& type) {
119+
this->unit = internal::checked_cast<const TimestampType&>(type).unit();
120+
}
121+
122+
bool ConvertOne(const char* s, size_t length, int64_t* out) {
123+
return internal::ParseTimestampISO8601(s, length, unit, out);
124+
}
125+
126+
TimeUnit::type unit;
127+
};
128+
129+
template <typename T>
130+
class NumericConverter : public PrimitiveConverter, public ParserAdapter<T> {
107131
public:
108-
using value_type = typename internal::StringConverter<T>::value_type;
132+
using value_type = typename T::c_type;
109133

110134
NumericConverter(MemoryPool* pool, const std::shared_ptr<DataType>& type)
111-
: PrimitiveConverter(pool, type), convert_one_(type) {}
135+
: PrimitiveConverter(pool, type) {
136+
ParserAdapter<T>::InitParser(*type);
137+
}
112138

113139
Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
114140
if (in->type_id() == Type::NA) {
@@ -122,7 +148,7 @@ class NumericConverter : public PrimitiveConverter {
122148

123149
auto visit_valid = [&](string_view repr) {
124150
value_type value;
125-
if (!convert_one_(repr.data(), repr.size(), &value)) {
151+
if (!ParserAdapter<T>::ConvertOne(repr.data(), repr.size(), &value)) {
126152
return GenericConversionError(*out_type_, ", couldn't parse:", repr);
127153
}
128154

@@ -138,8 +164,6 @@ class NumericConverter : public PrimitiveConverter {
138164
RETURN_NOT_OK(VisitDictionaryEntries(dict_array, visit_valid, visit_null));
139165
return builder.Finish(out);
140166
}
141-
142-
internal::StringConverter<T> convert_one_;
143167
};
144168

145169
template <typename DateTimeType>

cpp/src/arrow/python/deserialize.cc

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
namespace arrow {
5151

5252
using internal::checked_cast;
53-
using internal::StringConverter;
53+
using internal::ParseValue;
5454

5555
namespace py {
5656

@@ -232,10 +232,9 @@ Status GetPythonTypes(const UnionArray& data, std::vector<int8_t>* result) {
232232
ARROW_CHECK(result != nullptr);
233233
auto type = data.type();
234234
for (int i = 0; i < type->num_children(); ++i) {
235-
StringConverter<Int8Type> converter;
236235
int8_t tag = 0;
237236
const std::string& data = type->child(i)->name();
238-
if (!converter(data.c_str(), data.size(), &tag)) {
237+
if (!ParseValue<Int8Type>(data.c_str(), data.size(), &tag)) {
239238
return Status::SerializationError("Cannot convert string: \"",
240239
type->child(i)->name(), "\" to int8_t");
241240
}

cpp/src/arrow/scalar.cc

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -220,16 +220,25 @@ std::string Scalar::ToString() const {
220220
}
221221

222222
struct ScalarParseImpl {
223-
template <typename T, typename Converter = internal::StringConverter<T>,
224-
typename Value = typename Converter::value_type>
223+
// XXX Use of detail here not ideal
224+
template <typename T,
225+
typename Value = typename internal::detail::StringConverter<T>::value_type>
225226
Status Visit(const T& t) {
226227
Value value;
227-
if (!Converter{type_}(s_.data(), s_.size(), &value)) {
228+
if (!internal::ParseValue<T>(s_.data(), s_.size(), &value)) {
228229
return Status::Invalid("error parsing '", s_, "' as scalar of type ", t);
229230
}
230231
return Finish(std::move(value));
231232
}
232233

234+
Status Visit(const TimestampType& t) {
235+
int64_t value;
236+
if (!internal::ParseTimestampISO8601(s_.data(), s_.size(), t.unit(), &value)) {
237+
return Status::Invalid("error parsing '", s_, "' as scalar of type ", t);
238+
}
239+
return Finish(value);
240+
}
241+
233242
Status Visit(const BinaryType&) { return FinishWithBuffer(); }
234243

235244
Status Visit(const LargeBinaryType&) { return FinishWithBuffer(); }

cpp/src/arrow/util/decimal.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,12 +180,11 @@ static constexpr int64_t kPowersOfTen[kInt64DecimalDigits + 1] = {1LL,
180180
// the appropriate power of 10 necessary to add source parsed as uint64 and
181181
// then adds the parsed value of source.
182182
static inline void ShiftAndAdd(const char* data, size_t length, Decimal128* out) {
183-
internal::StringConverter<Int64Type> converter;
184183
for (size_t posn = 0; posn < length;) {
185184
const size_t group_size = std::min(kInt64DecimalDigits, length - posn);
186185
const int64_t multiple = kPowersOfTen[group_size];
187186
int64_t chunk = 0;
188-
ARROW_CHECK(converter(data + posn, group_size, &chunk));
187+
ARROW_CHECK(internal::ParseValue<Int64Type>(data + posn, group_size, &chunk));
189188

190189
*out *= multiple;
191190
*out += chunk;
@@ -275,8 +274,7 @@ bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* out)
275274
++pos;
276275
}
277276
out->has_exponent = true;
278-
internal::StringConverter<Int32Type> exponent_converter;
279-
return exponent_converter(s + pos, size - pos, &(out->exponent));
277+
return internal::ParseValue<Int32Type>(s + pos, size - pos, &(out->exponent));
280278
}
281279
return pos == size;
282280
}

0 commit comments

Comments
 (0)