forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrecord_batch.h
More file actions
340 lines (277 loc) · 12.3 KB
/
Copy pathrecord_batch.h
File metadata and controls
340 lines (277 loc) · 12.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/iterator.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
/// \class RecordBatch
/// \brief Collection of equal-length arrays matching a particular Schema
///
/// A record batch is a table-like data structure that is semantically a
/// sequence of fields, each a contiguous Arrow array.
///
/// This is an abstract interface: the column accessors and column-editing
/// methods are pure virtual and the constructor is protected, so instances
/// are obtained through the static Make / MakeEmpty / FromStructArray
/// factories.
class ARROW_EXPORT RecordBatch {
public:
virtual ~RecordBatch() = default;
/// \brief Construct record batch from vector of arrays
///
/// \param[in] schema The record batch schema
/// \param[in] num_rows length of fields in the record batch. Each array
/// should have the same length as num_rows
/// \param[in] columns the record batch fields as vector of arrays
static std::shared_ptr<RecordBatch> Make(std::shared_ptr<Schema> schema,
int64_t num_rows,
std::vector<std::shared_ptr<Array>> columns);
/// \brief Construct record batch from vector of internal data structures
/// \since 0.5.0
///
/// This function is intended for internal use, or advanced users.
///
/// \param schema the record batch schema
/// \param num_rows the number of semantic rows in the record batch. This
/// should be equal to the length of each field
/// \param columns the data for the batch's columns
static std::shared_ptr<RecordBatch> Make(
std::shared_ptr<Schema> schema, int64_t num_rows,
std::vector<std::shared_ptr<ArrayData>> columns);
/// \brief Create an empty RecordBatch of a given schema
///
/// The output RecordBatch will be created with DataTypes from
/// the given schema.
///
/// \param[in] schema the schema of the empty RecordBatch
/// \param[in] pool the memory pool to allocate memory from
/// \return the resulting RecordBatch
static Result<std::shared_ptr<RecordBatch>> MakeEmpty(
std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool());
/// \brief Convert record batch to struct array
///
/// Create a struct array whose child arrays are the record batch's columns.
/// Note that the record batch's top-level field metadata cannot be reflected
/// in the resulting struct array.
Result<std::shared_ptr<StructArray>> ToStructArray() const;
/// \brief Construct record batch from struct array
///
/// This constructs a record batch using the child arrays of the given
/// array, which must be a struct array. Note that the struct array's own
/// null bitmap is not reflected in the resulting record batch.
static Result<std::shared_ptr<RecordBatch>> FromStructArray(
const std::shared_ptr<Array>& array);
/// \brief Determine if two record batches are exactly equal
///
/// \param[in] other the RecordBatch to compare with
/// \param[in] check_metadata if true, check that Schema metadata is the same
/// \return true if batches are equal
bool Equals(const RecordBatch& other, bool check_metadata = false) const;
/// \brief Determine if two record batches are approximately equal
///
/// \param[in] other the RecordBatch to compare with
/// \return true if batches are approximately equal
bool ApproxEquals(const RecordBatch& other) const;
/// \return the record batch's schema
const std::shared_ptr<Schema>& schema() const { return schema_; }
/// \brief Retrieve all columns at once
virtual const std::vector<std::shared_ptr<Array>>& columns() const = 0;
/// \brief Retrieve an array from the record batch
/// \param[in] i field index, does not boundscheck
/// \return an Array object
virtual std::shared_ptr<Array> column(int i) const = 0;
/// \brief Retrieve an array from the record batch
/// \param[in] name field name
/// \return an Array or null if no field was found
std::shared_ptr<Array> GetColumnByName(const std::string& name) const;
/// \brief Retrieve an array's internal data from the record batch
/// \param[in] i field index, does not boundscheck
/// \return an internal ArrayData object
virtual std::shared_ptr<ArrayData> column_data(int i) const = 0;
/// \brief Retrieve all arrays' internal data from the record batch.
virtual const ArrayDataVector& column_data() const = 0;
/// \brief Add column to the record batch, producing a new RecordBatch
///
/// \param[in] i field index, which will be boundschecked
/// \param[in] field field to be added
/// \param[in] column column to be added
virtual Result<std::shared_ptr<RecordBatch>> AddColumn(
int i, const std::shared_ptr<Field>& field,
const std::shared_ptr<Array>& column) const = 0;
/// \brief Add new nullable column to the record batch, producing a new
/// RecordBatch.
///
/// For non-nullable columns, use the Field-based version of this method.
///
/// \param[in] i field index, which will be boundschecked
/// \param[in] field_name name of field to be added
/// \param[in] column column to be added
virtual Result<std::shared_ptr<RecordBatch>> AddColumn(
int i, std::string field_name, const std::shared_ptr<Array>& column) const;
/// \brief Replace a column in the record batch, producing a new RecordBatch
///
/// \param[in] i field index, does boundscheck
/// \param[in] field field to be replaced
/// \param[in] column column to be replaced
virtual Result<std::shared_ptr<RecordBatch>> SetColumn(
int i, const std::shared_ptr<Field>& field,
const std::shared_ptr<Array>& column) const = 0;
/// \brief Remove column from the record batch, producing a new RecordBatch
///
/// \param[in] i field index, does boundscheck
virtual Result<std::shared_ptr<RecordBatch>> RemoveColumn(int i) const = 0;
/// NOTE(review): this method had no documentation. Judging by its name and
/// const signature it presumably returns a new RecordBatch whose schema
/// carries `metadata` in place of the current schema metadata -- confirm
/// against the implementing subclass.
///
/// \param[in] metadata the key-value metadata to use
virtual std::shared_ptr<RecordBatch> ReplaceSchemaMetadata(
const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;
/// \brief Name of the i-th column
const std::string& column_name(int i) const;
/// \return the number of columns in the record batch
int num_columns() const;
/// \return the number of rows (the corresponding length of each column)
int64_t num_rows() const { return num_rows_; }
/// \brief Slice each of the arrays in the record batch
/// \param[in] offset the starting offset to slice, through end of batch
/// \return new record batch
virtual std::shared_ptr<RecordBatch> Slice(int64_t offset) const;
/// \brief Slice each of the arrays in the record batch
/// \param[in] offset the starting offset to slice
/// \param[in] length the number of elements to slice from offset
/// \return new record batch
virtual std::shared_ptr<RecordBatch> Slice(int64_t offset, int64_t length) const = 0;
/// \return PrettyPrint representation suitable for debugging
std::string ToString() const;
/// \brief Return new record batch with specified columns
///
/// \param[in] indices indices of the columns to select
Result<std::shared_ptr<RecordBatch>> SelectColumns(
const std::vector<int>& indices) const;
/// \brief Perform cheap validation checks to determine obvious inconsistencies
/// within the record batch's schema and internal data.
///
/// This is O(k) where k is the total number of fields and array descendants.
///
/// \return Status
virtual Status Validate() const;
/// \brief Perform extensive validation checks to determine inconsistencies
/// within the record batch's schema and internal data.
///
/// This is potentially O(k*n) where n is the number of rows.
///
/// \return Status
virtual Status ValidateFull() const;
protected:
/// Constructor for use by subclasses only; takes the schema and row count
/// that schema() and num_rows() report.
RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows);
/// Schema returned by schema()
std::shared_ptr<Schema> schema_;
/// Row count returned by num_rows()
int64_t num_rows_;
private:
// Presumably makes the class non-copyable (macro comes from
// arrow/util/macros.h) -- confirm against the macro definition.
ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatch);
};
/// \brief A record batch paired with custom key-value metadata
///
/// This is the value type of the Result returned by the argument-less
/// RecordBatchReader::ReadNext() overload.
struct ARROW_EXPORT RecordBatchWithMetadata {
/// The record batch
std::shared_ptr<RecordBatch> batch;
/// Custom key-value metadata carried alongside `batch`
std::shared_ptr<KeyValueMetadata> custom_metadata;
};
/// \brief Abstract interface for reading stream of record batches
///
/// Subclasses implement schema() and ReadNext(std::shared_ptr<RecordBatch>*).
/// The remaining members are conveniences layered on top of those: Next(),
/// the begin()/end() iterator pair and the whole-stream helpers.
class ARROW_EXPORT RecordBatchReader {
 public:
  using ValueType = std::shared_ptr<RecordBatch>;

  virtual ~RecordBatchReader();

  /// \return the shared schema of the record batches in the stream
  virtual std::shared_ptr<Schema> schema() const = 0;

  /// \brief Read the next record batch in the stream. Return null for batch
  /// when reaching end of stream
  ///
  /// \param[out] batch the next loaded batch, null at end of stream
  /// \return Status
  virtual Status ReadNext(std::shared_ptr<RecordBatch>* batch) = 0;

  /// \brief Read the next record batch together with its custom metadata
  ///
  /// The default implementation reads nothing and returns
  /// Status::NotImplemented; readers that can supply custom metadata
  /// override it.
  ///
  /// \return the batch and its metadata, or an error Status
  virtual Result<RecordBatchWithMetadata> ReadNext() {
    return Status::NotImplemented("ReadNext with custom metadata");
  }

  /// \brief Iterator interface
  ///
  /// Wraps ReadNext(std::shared_ptr<RecordBatch>*) so that the batch is
  /// returned by value instead of through an out-parameter.
  ///
  /// \return the next batch (null at end of stream), or the error Status
  /// reported by ReadNext
  Result<std::shared_ptr<RecordBatch>> Next() {
    std::shared_ptr<RecordBatch> batch;
    ARROW_RETURN_NOT_OK(ReadNext(&batch));
    return batch;
  }

  /// \brief finalize reader
  ///
  /// The default implementation does nothing and returns Status::OK().
  virtual Status Close() { return Status::OK(); }

  /// \brief Input iterator over the batches produced by a RecordBatchReader
  ///
  /// The iterator caches the Result of the most recent read. The end
  /// iterator caches a null batch, which is also what the reader yields at
  /// end of stream, so an iterator compares equal to end() once the stream
  /// is exhausted. Dereferencing returns a Result, which means a read error
  /// surfaces at dereference time rather than at increment time.
  ///
  /// NOTE(review): an iterator caching an error Result presumably does not
  /// compare equal to end(), so callers should stop iterating as soon as
  /// dereferencing yields an error -- confirm against Result's operator==.
  class RecordBatchReaderIterator {
   public:
    using iterator_category = std::input_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = std::shared_ptr<RecordBatch>;
    using pointer = value_type const*;
    using reference = value_type const&;

    /// Construct the end-of-stream iterator (no reader, null batch).
    RecordBatchReaderIterator() : batch_(RecordBatchEnd()), reader_(NULLPTR) {}

    /// Construct an iterator over `reader` and eagerly read the first batch.
    explicit RecordBatchReaderIterator(RecordBatchReader* reader)
        : batch_(RecordBatchEnd()), reader_(reader) {
      Next();
    }

    /// Two iterators are equal when their cached read results are equal;
    /// the reader pointer does not take part in the comparison.
    bool operator==(const RecordBatchReaderIterator& other) const {
      return batch_ == other.batch_;
    }

    bool operator!=(const RecordBatchReaderIterator& other) const {
      return !(*this == other);
    }

    /// \return the cached batch, or the error from the read that produced it
    ///
    /// Const-qualified so that const iterators can be dereferenced, as the
    /// input iterator requirements expect; it only reads the cached Result.
    Result<std::shared_ptr<RecordBatch>> operator*() const {
      ARROW_RETURN_NOT_OK(batch_.status());
      return batch_;
    }

    /// Pre-increment: read the next batch from the reader.
    RecordBatchReaderIterator& operator++() {
      Next();
      return *this;
    }

    /// Post-increment: read the next batch, returning a copy of the
    /// iterator as it was before the read.
    RecordBatchReaderIterator operator++(int) {
      RecordBatchReaderIterator tmp(*this);
      Next();
      return tmp;
    }

   private:
    /// The null batch used as the end-of-stream sentinel.
    ///
    /// Static because it needs no instance state and is called from the
    /// constructors' member-initializer lists, before the object is fully
    /// constructed.
    static std::shared_ptr<RecordBatch> RecordBatchEnd() {
      return std::shared_ptr<RecordBatch>(NULLPTR);
    }

    /// Refresh the cached result from the reader. An iterator without a
    /// reader (the end iterator) stays at the end sentinel.
    void Next() {
      if (reader_ == NULLPTR) {
        batch_ = RecordBatchEnd();
        return;
      }
      batch_ = reader_->Next();
    }

    /// Result of the most recent read (null batch at end of stream)
    Result<std::shared_ptr<RecordBatch>> batch_;
    /// Non-owning pointer to the reader; NULLPTR for the end iterator
    RecordBatchReader* reader_;
  };

  /// \brief Return an iterator to the first record batch in the stream
  ///
  /// Constructing the iterator reads one batch from this reader.
  RecordBatchReaderIterator begin() { return RecordBatchReaderIterator(this); }

  /// \brief Return an iterator to the end of the stream
  RecordBatchReaderIterator end() { return RecordBatchReaderIterator(); }

  /// \brief Consume entire stream as a vector of record batches
  Result<RecordBatchVector> ToRecordBatches();

  ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToRecordBatches instead.")
  Status ReadAll(RecordBatchVector* batches);

  /// \brief Read all batches and concatenate as arrow::Table
  Result<std::shared_ptr<Table>> ToTable();

  ARROW_DEPRECATED("Deprecated in 8.0.0. Use ToTable instead.")
  Status ReadAll(std::shared_ptr<Table>* table);

  /// \brief Create a RecordBatchReader from a vector of RecordBatch.
  ///
  /// \param[in] batches the vector of RecordBatch to read from
  /// \param[in] schema schema to conform to. Will be inferred from the first
  ///            element if not provided.
  static Result<std::shared_ptr<RecordBatchReader>> Make(
      RecordBatchVector batches, std::shared_ptr<Schema> schema = NULLPTR);

  /// \brief Create a RecordBatchReader from an Iterator of RecordBatch.
  ///
  /// \param[in] batches an iterator of RecordBatch to read from.
  /// \param[in] schema schema that each record batch in iterator will conform to.
  static Result<std::shared_ptr<RecordBatchReader>> MakeFromIterator(
      Iterator<std::shared_ptr<RecordBatch>> batches, std::shared_ptr<Schema> schema);
};
} // namespace arrow