Skip to content

Commit a66ab10

Browse files
bkietzwesm
authored andcommitted
ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings
Serialization is implemented by converting Expressions to Arrays then writing a tiny IPC file. This is a ridiculous way to serialize Expressions but it should be acceptable since these classes are destined to be replaced by `arrow::engine::Expr` Closes apache#7026 from bkietz/7391-Remove-unnecessary-classe Authored-by: Benjamin Kietzman <bengilgit@gmail.com> Signed-off-by: Wes McKinney <wesm+git@apache.org>
1 parent fb4d57a commit a66ab10

42 files changed

Lines changed: 949 additions & 956 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

cpp/src/arrow/array.cc

Lines changed: 125 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,99 @@ int64_t ArrayData::GetNullCount() const {
132132

133133
int64_t Array::null_count() const { return data_->GetNullCount(); }
134134

135+
namespace internal {
136+
137+
struct ScalarFromArraySlotImpl {
138+
template <typename T>
139+
using ScalarType = typename TypeTraits<T>::ScalarType;
140+
141+
Status Visit(const NullArray& a) {
142+
out_ = std::make_shared<NullScalar>();
143+
return Status::OK();
144+
}
145+
146+
Status Visit(const BooleanArray& a) { return Finish(a.Value(index_)); }
147+
148+
template <typename T>
149+
Status Visit(const NumericArray<T>& a) {
150+
return Finish(a.Value(index_));
151+
}
152+
153+
template <typename T>
154+
Status Visit(const BaseBinaryArray<T>& a) {
155+
return Finish(a.GetString(index_));
156+
}
157+
158+
Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); }
159+
160+
Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); }
161+
162+
template <typename T>
163+
Status Visit(const BaseListArray<T>& a) {
164+
return Finish(a.value_slice(index_));
165+
}
166+
167+
Status Visit(const FixedSizeListArray& a) { return Finish(a.value_slice(index_)); }
168+
169+
Status Visit(const StructArray& a) {
170+
ScalarVector children;
171+
for (const auto& child : a.fields()) {
172+
children.emplace_back();
173+
ARROW_ASSIGN_OR_RAISE(children.back(), child->GetScalar(index_));
174+
}
175+
return Finish(std::move(children));
176+
}
177+
178+
Status Visit(const UnionArray& a) {
179+
return Status::NotImplemented("Non-null UnionScalar");
180+
}
181+
182+
Status Visit(const DictionaryArray& a) {
183+
ARROW_ASSIGN_OR_RAISE(auto value, a.dictionary()->GetScalar(a.GetValueIndex(index_)));
184+
return Finish(std::move(value));
185+
}
186+
187+
Status Visit(const ExtensionArray& a) {
188+
return Status::NotImplemented("Non-null ExtensionScalar");
189+
}
190+
191+
template <typename Arg>
192+
Status Finish(Arg&& arg) {
193+
return MakeScalar(array_.type(), std::forward<Arg>(arg)).Value(&out_);
194+
}
195+
196+
Status Finish(std::string arg) {
197+
return MakeScalar(array_.type(), Buffer::FromString(std::move(arg))).Value(&out_);
198+
}
199+
200+
Result<std::shared_ptr<Scalar>> Finish() && {
201+
if (index_ >= array_.length()) {
202+
return Status::IndexError("tried to refer to element ", index_,
203+
" but array is only ", array_.length(), " long");
204+
}
205+
206+
if (array_.IsNull(index_)) {
207+
return MakeNullScalar(array_.type());
208+
}
209+
210+
RETURN_NOT_OK(VisitArrayInline(array_, this));
211+
return std::move(out_);
212+
}
213+
214+
ScalarFromArraySlotImpl(const Array& array, int64_t index)
215+
: array_(array), index_(index) {}
216+
217+
const Array& array_;
218+
int64_t index_;
219+
std::shared_ptr<Scalar> out_;
220+
};
221+
222+
} // namespace internal
223+
224+
Result<std::shared_ptr<Scalar>> Array::GetScalar(int64_t i) const {
225+
return internal::ScalarFromArraySlotImpl{*this, i}.Finish();
226+
}
227+
135228
std::string Array::Diff(const Array& other) const {
136229
std::stringstream diff;
137230
ARROW_IGNORE_EXPR(Equals(other, EqualOptions().diff_sink(&diff)));
@@ -363,20 +456,20 @@ Result<std::shared_ptr<Array>> FlattenListArray(const ListArrayT& list_array,
363456

364457
} // namespace
365458

366-
ListArray::ListArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
459+
ListArray::ListArray(std::shared_ptr<ArrayData> data) { SetData(std::move(data)); }
367460

368461
LargeListArray::LargeListArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
369462

370-
ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length,
371-
const std::shared_ptr<Buffer>& value_offsets,
372-
const std::shared_ptr<Array>& values,
373-
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
463+
ListArray::ListArray(std::shared_ptr<DataType> type, int64_t length,
464+
std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
465+
std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
374466
int64_t offset) {
375467
ARROW_CHECK_EQ(type->id(), Type::LIST);
376-
auto internal_data =
377-
ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset);
468+
auto internal_data = ArrayData::Make(
469+
std::move(type), length,
470+
BufferVector{std::move(null_bitmap), std::move(value_offsets)}, null_count, offset);
378471
internal_data->child_data.emplace_back(values->data());
379-
SetData(internal_data);
472+
SetData(std::move(internal_data));
380473
}
381474

382475
LargeListArray::LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
@@ -788,6 +881,13 @@ const StructType* StructArray::struct_type() const {
788881
return checked_cast<const StructType*>(data_->type.get());
789882
}
790883

884+
const ArrayVector& StructArray::fields() const {
885+
for (int i = 0; i < num_fields(); ++i) {
886+
(void)field(i);
887+
}
888+
return boxed_fields_;
889+
}
890+
791891
std::shared_ptr<Array> StructArray::field(int i) const {
792892
std::shared_ptr<Array> result = internal::atomic_load(&boxed_fields_[i]);
793893
if (!result) {
@@ -1034,6 +1134,23 @@ Status ValidateDictionaryIndices(const std::shared_ptr<Array>& indices,
10341134

10351135
std::shared_ptr<Array> DictionaryArray::indices() const { return indices_; }
10361136

1137+
int64_t DictionaryArray::GetValueIndex(int64_t i) const {
1138+
switch (indices_->type_id()) {
1139+
case Type::INT8:
1140+
return checked_cast<const Int8Array&>(*indices_).Value(i);
1141+
case Type::INT16:
1142+
return checked_cast<const Int16Array&>(*indices_).Value(i);
1143+
case Type::INT32:
1144+
return checked_cast<const Int32Array&>(*indices_).Value(i);
1145+
case Type::INT64:
1146+
return checked_cast<const Int64Array&>(*indices_).Value(i);
1147+
default:
1148+
break;
1149+
}
1150+
ARROW_CHECK(false) << "unreachable";
1151+
return -1;
1152+
}
1153+
10371154
DictionaryArray::DictionaryArray(const std::shared_ptr<ArrayData>& data)
10381155
: dict_type_(checked_cast<const DictionaryType*>(data->type.get())) {
10391156
ARROW_CHECK_EQ(data->type->id(), Type::DICTIONARY);

cpp/src/arrow/array.h

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,9 @@ class ARROW_EXPORT Array {
323323
BitUtil::GetBit(null_bitmap_data_, i + data_->offset);
324324
}
325325

326+
/// \brief Return a Scalar containing the value of this array at i
327+
Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;
328+
326329
/// Size in the number of elements this array contains.
327330
int64_t length() const { return data_->length; }
328331

@@ -615,12 +618,11 @@ class BaseListArray : public Array {
615618
/// Concrete Array class for list data
616619
class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
617620
public:
618-
explicit ListArray(const std::shared_ptr<ArrayData>& data);
621+
explicit ListArray(std::shared_ptr<ArrayData> data);
619622

620-
ListArray(const std::shared_ptr<DataType>& type, int64_t length,
621-
const std::shared_ptr<Buffer>& value_offsets,
622-
const std::shared_ptr<Array>& values,
623-
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
623+
ListArray(std::shared_ptr<DataType> type, int64_t length,
624+
std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
625+
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
624626
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
625627

626628
/// \brief Construct ListArray from array of offsets and child value array
@@ -1081,6 +1083,8 @@ class ARROW_EXPORT StructArray : public Array {
10811083
// count adjusted.
10821084
std::shared_ptr<Array> field(int pos) const;
10831085

1086+
const ArrayVector& fields() const;
1087+
10841088
/// Returns null if name not found
10851089
std::shared_ptr<Array> GetFieldByName(const std::string& name) const;
10861090

@@ -1095,7 +1099,7 @@ class ARROW_EXPORT StructArray : public Array {
10951099
private:
10961100
// For caching boxed child data
10971101
// XXX This is not handled in a thread-safe manner.
1098-
mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
1102+
mutable ArrayVector boxed_fields_;
10991103
};
11001104

11011105
// ----------------------------------------------------------------------
@@ -1369,6 +1373,9 @@ class ARROW_EXPORT DictionaryArray : public Array {
13691373
std::shared_ptr<Array> dictionary() const;
13701374
std::shared_ptr<Array> indices() const;
13711375

1376+
/// \brief Return the ith value of indices, cast to int64_t
1377+
int64_t GetValueIndex(int64_t i) const;
1378+
13721379
const DictionaryType* dict_type() const { return dict_type_; }
13731380

13741381
private:

cpp/src/arrow/dataset/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,5 @@ normalization are also addressed.
2828

2929
## Development Status
3030

31-
Pre-alpha as of June 2019. API subject to change without notice.
31+
Alpha/beta stage as of April 2020. API subject to change, possibly
32+
without deprecation notices.

cpp/src/arrow/dataset/api.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
// This API is EXPERIMENTAL.
19+
1820
#pragma once
1921

2022
#include "arrow/dataset/dataset.h"

cpp/src/arrow/dataset/dataset.cc

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ namespace arrow {
3131
namespace dataset {
3232

3333
Fragment::Fragment(std::shared_ptr<Expression> partition_expression)
34-
: partition_expression_(partition_expression ? partition_expression : scalar(true)) {}
34+
: partition_expression_(std::move(partition_expression)) {
35+
DCHECK_NE(partition_expression_, nullptr);
36+
}
3537

3638
Result<std::shared_ptr<Schema>> InMemoryFragment::ReadPhysicalSchema() { return schema_; }
3739

@@ -70,6 +72,12 @@ Result<ScanTaskIterator> InMemoryFragment::Scan(std::shared_ptr<ScanOptions> opt
7072
return MakeMapIterator(fn, std::move(batches_it));
7173
}
7274

75+
Dataset::Dataset(std::shared_ptr<Schema> schema,
76+
std::shared_ptr<Expression> partition_expression)
77+
: schema_(std::move(schema)), partition_expression_(std::move(partition_expression)) {
78+
DCHECK_NE(partition_expression_, nullptr);
79+
}
80+
7381
Result<std::shared_ptr<ScannerBuilder>> Dataset::NewScan(
7482
std::shared_ptr<ScanContext> context) {
7583
return std::make_shared<ScannerBuilder>(this->shared_from_this(), context);
@@ -79,13 +87,8 @@ Result<std::shared_ptr<ScannerBuilder>> Dataset::NewScan() {
7987
return NewScan(std::make_shared<ScanContext>());
8088
}
8189

82-
FragmentIterator Dataset::GetFragments() { return GetFragments(scalar(true)); }
83-
8490
FragmentIterator Dataset::GetFragments(std::shared_ptr<Expression> predicate) {
85-
if (partition_expression_) {
86-
predicate = predicate->Assume(*partition_expression_);
87-
}
88-
91+
predicate = predicate->Assume(*partition_expression_);
8992
return predicate->IsSatisfiable() ? GetFragmentsImpl(std::move(predicate))
9093
: MakeEmptyIterator<std::shared_ptr<Fragment>>();
9194
}

cpp/src/arrow/dataset/dataset.h

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
// This API is EXPERIMENTAL.
19+
1820
#pragma once
1921

2022
#include <functional>
@@ -74,19 +76,20 @@ class ARROW_DS_EXPORT Fragment {
7476
virtual ~Fragment() = default;
7577

7678
protected:
77-
explicit Fragment(std::shared_ptr<Expression> partition_expression = NULLPTR);
79+
Fragment() = default;
80+
explicit Fragment(std::shared_ptr<Expression> partition_expression);
7881

79-
std::shared_ptr<Expression> partition_expression_;
82+
std::shared_ptr<Expression> partition_expression_ = scalar(true);
8083
};
8184

8285
/// \brief A trivial Fragment that yields ScanTask out of a fixed set of
8386
/// RecordBatch.
8487
class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
8588
public:
8689
InMemoryFragment(std::shared_ptr<Schema> schema, RecordBatchVector record_batches,
87-
std::shared_ptr<Expression> = NULLPTR);
90+
std::shared_ptr<Expression> = scalar(true));
8891
explicit InMemoryFragment(RecordBatchVector record_batches,
89-
std::shared_ptr<Expression> = NULLPTR);
92+
std::shared_ptr<Expression> = scalar(true));
9093

9194
Result<std::shared_ptr<Schema>> ReadPhysicalSchema() override;
9295

@@ -114,8 +117,7 @@ class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
114117
Result<std::shared_ptr<ScannerBuilder>> NewScan();
115118

116119
/// \brief GetFragments returns an iterator of Fragments given a predicate.
117-
FragmentIterator GetFragments(std::shared_ptr<Expression> predicate);
118-
FragmentIterator GetFragments();
120+
FragmentIterator GetFragments(std::shared_ptr<Expression> predicate = scalar(true));
119121

120122
const std::shared_ptr<Schema>& schema() const { return schema_; }
121123

@@ -140,14 +142,13 @@ class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
140142
protected:
141143
explicit Dataset(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}
142144

143-
Dataset(std::shared_ptr<Schema> schema, std::shared_ptr<Expression> e)
144-
: schema_(std::move(schema)), partition_expression_(std::move(e)) {}
145-
Dataset() = default;
145+
Dataset(std::shared_ptr<Schema> schema,
146+
std::shared_ptr<Expression> partition_expression);
146147

147148
virtual FragmentIterator GetFragmentsImpl(std::shared_ptr<Expression> predicate) = 0;
148149

149150
std::shared_ptr<Schema> schema_;
150-
std::shared_ptr<Expression> partition_expression_;
151+
std::shared_ptr<Expression> partition_expression_ = scalar(true);
151152
};
152153

153154
/// \brief A Source which yields fragments wrapping a stream of record batches.

cpp/src/arrow/dataset/discovery.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
/// dataset with possible partitioning according to available
2020
/// partitioning
2121

22+
// This API is EXPERIMENTAL.
23+
2224
#pragma once
2325

2426
#include <memory>

cpp/src/arrow/dataset/file_base.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
// This API is EXPERIMENTAL.
19+
1820
#pragma once
1921

2022
#include <memory>

cpp/src/arrow/dataset/file_ipc.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
// This API is EXPERIMENTAL.
19+
1820
#pragma once
1921

2022
#include <memory>

cpp/src/arrow/dataset/file_parquet.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
// This API is EXPERIMENTAL.
19+
1820
#pragma once
1921

2022
#include <memory>

0 commit comments

Comments
 (0)