Skip to content

Commit aa8bb3c

Browse files
pitrouwesm
authored and committed
ARROW-3986: [C++] Document memory management and table APIs
Author: Antoine Pitrou <antoine@python.org> Closes apache#3159 from pitrou/ARROW-3986-more-prose-documentation and squashes the following commits: 4e8ff42 <Antoine Pitrou> ARROW-3986: Document memory management and table APIs
1 parent a3ba1a2 commit aa8bb3c

14 files changed

Lines changed: 459 additions & 34 deletions

File tree

cpp/src/arrow/allocator.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
namespace arrow {
3131

32+
/// \brief An STL allocator delegating allocations to an Arrow MemoryPool
3233
template <class T>
3334
class stl_allocator {
3435
public:
@@ -45,7 +46,9 @@ class stl_allocator {
4546
using other = stl_allocator<U>;
4647
};
4748

49+
/// \brief Construct an allocator from the default MemoryPool
4850
stl_allocator() noexcept : pool_(default_memory_pool()) {}
51+
/// \brief Construct an allocator from the given MemoryPool
4952
explicit stl_allocator(MemoryPool* pool) noexcept : pool_(pool) {}
5053

5154
template <class U>
@@ -86,9 +89,14 @@ class stl_allocator {
8689
MemoryPool* pool_;
8790
};
8891

92+
/// \brief A MemoryPool implementation delegating allocations to an STL allocator
93+
///
94+
/// Note that STL allocators don't provide a resizing operation, and therefore
95+
/// any buffer resizes will do a full reallocation and copy.
8996
template <typename Allocator = std::allocator<uint8_t>>
9097
class STLMemoryPool : public MemoryPool {
9198
public:
99+
/// \brief Construct a memory pool from the given allocator
92100
explicit STLMemoryPool(const Allocator& alloc) : alloc_(alloc) {}
93101

94102
Status Allocate(int64_t size, uint8_t** out) override {

cpp/src/arrow/buffer.h

Lines changed: 55 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,15 @@ namespace arrow {
4040

4141
/// \class Buffer
4242
/// \brief Object containing a pointer to a piece of contiguous memory with a
43-
/// particular size. Base class does not own its memory
43+
/// particular size.
4444
///
4545
/// Buffers have two related notions of length: size and capacity. Size is
4646
/// the number of bytes that might have valid data. Capacity is the number
47-
/// of bytes that where allocated for the buffer in total.
47+
/// of bytes that were allocated for the buffer in total.
4848
///
49-
/// The following invariant is always true: Size < Capacity
49+
/// The Buffer base class does not own its memory, but subclasses often do.
50+
///
51+
/// The following invariant is always true: Size <= Capacity
5052
class ARROW_EXPORT Buffer {
5153
public:
5254
/// \brief Construct from buffer and size without copying memory
@@ -158,18 +160,25 @@ class ARROW_EXPORT Buffer {
158160
/// \note Can throw std::bad_alloc if buffer is large
159161
std::string ToString() const;
160162

161-
int64_t capacity() const { return capacity_; }
163+
/// \brief Return a pointer to the buffer's data
162164
const uint8_t* data() const { return data_; }
163-
165+
/// \brief Return a writable pointer to the buffer's data
166+
///
167+
/// The buffer has to be mutable. Otherwise, an assertion may fail (in debug builds)
168+
/// or a null pointer may be returned.
164169
uint8_t* mutable_data() {
165170
#ifndef NDEBUG
166171
CheckMutable();
167172
#endif
168173
return mutable_data_;
169174
}
170175

176+
/// \brief Return the buffer's size in bytes
171177
int64_t size() const { return size_; }
172178

179+
/// \brief Return the buffer's capacity (number of allocated bytes)
180+
int64_t capacity() const { return capacity_; }
181+
173182
std::shared_ptr<Buffer> parent() const { return parent_; }
174183

175184
protected:
@@ -188,26 +197,38 @@ class ARROW_EXPORT Buffer {
188197
ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer);
189198
};
190199

191-
/// Construct a view on passed buffer at the indicated offset and length. This
192-
/// function cannot fail and does not error checking (except in debug builds)
200+
/// \defgroup buffer-slicing-functions Functions for slicing buffers
201+
///
202+
/// @{
203+
204+
/// \brief Construct a view on a buffer at the given offset and length.
205+
///
206+
/// This function cannot fail and does not check for errors (except in debug builds)
193207
static inline std::shared_ptr<Buffer> SliceBuffer(const std::shared_ptr<Buffer>& buffer,
194208
const int64_t offset,
195209
const int64_t length) {
196210
return std::make_shared<Buffer>(buffer, offset, length);
197211
}
198212

213+
/// \brief Construct a view on a buffer at the given offset, up to the buffer's end.
214+
///
215+
/// This function cannot fail and does not check for errors (except in debug builds)
199216
static inline std::shared_ptr<Buffer> SliceBuffer(const std::shared_ptr<Buffer>& buffer,
200217
const int64_t offset) {
201218
int64_t length = buffer->size() - offset;
202219
return SliceBuffer(buffer, offset, length);
203220
}
204221

205-
/// Construct a mutable buffer slice. If the parent buffer is not mutable, this
206-
/// will abort in debug builds
222+
/// \brief Like SliceBuffer, but construct a mutable buffer slice.
223+
///
224+
/// If the parent buffer is not mutable, behavior is undefined (it may abort
225+
/// in debug builds).
207226
ARROW_EXPORT
208227
std::shared_ptr<Buffer> SliceMutableBuffer(const std::shared_ptr<Buffer>& buffer,
209228
const int64_t offset, const int64_t length);
210229

230+
/// @}
231+
211232
/// \class MutableBuffer
212233
/// \brief A Buffer whose contents can be mutated. May or may not own its data.
213234
class ARROW_EXPORT MutableBuffer : public Buffer {
@@ -266,6 +287,10 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer {
266287
ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {}
267288
};
268289

290+
/// \defgroup buffer-allocation-functions Functions for allocating buffers
291+
///
292+
/// @{
293+
269294
/// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding.
270295
///
271296
/// \param[in] pool a memory pool
@@ -364,6 +389,8 @@ Status AllocateEmptyBitmap(MemoryPool* pool, int64_t length,
364389
ARROW_EXPORT
365390
Status AllocateEmptyBitmap(int64_t length, std::shared_ptr<Buffer>* out);
366391

392+
/// @}
393+
367394
// ----------------------------------------------------------------------
368395
// Buffer builder classes
369396

@@ -374,13 +401,13 @@ class ARROW_EXPORT BufferBuilder {
374401
explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
375402
: pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {}
376403

377-
/// \brief Resizes the buffer to the nearest multiple of 64 bytes
404+
/// \brief Resize the buffer to the nearest multiple of 64 bytes
378405
///
379406
/// \param elements the new capacity of the builder. Will be rounded
380407
/// up to a multiple of 64 bytes for padding
381-
/// \param shrink_to_fit if new capacity smaller than existing size,
408+
/// \param shrink_to_fit if new capacity is smaller than the existing size,
382409
/// reallocate internal buffer. Set to false to avoid reallocations when
383-
/// shrinking the builder
410+
/// shrinking the builder.
384411
/// \return Status
385412
Status Resize(const int64_t elements, bool shrink_to_fit = true) {
386413
// Resize(0) is a no-op
@@ -409,6 +436,9 @@ class ARROW_EXPORT BufferBuilder {
409436
/// \return Status
410437
Status Reserve(const int64_t size) { return Resize(size_ + size, false); }
411438

439+
/// \brief Append the given data to the buffer
440+
///
441+
/// The buffer is automatically expanded if necessary.
412442
Status Append(const void* data, int64_t length) {
413443
if (capacity_ < length + size_) {
414444
int64_t new_capacity = BitUtil::NextPower2(length + size_);
@@ -418,6 +448,9 @@ class ARROW_EXPORT BufferBuilder {
418448
return Status::OK();
419449
}
420450

451+
/// \brief Append the given data to the buffer
452+
///
453+
/// The buffer is automatically expanded if necessary.
421454
template <size_t NBYTES>
422455
Status Append(const std::array<uint8_t, NBYTES>& data) {
423456
constexpr auto nbytes = static_cast<int64_t>(NBYTES);
@@ -448,6 +481,15 @@ class ARROW_EXPORT BufferBuilder {
448481
size_ += length;
449482
}
450483

484+
/// \brief Return result of builder as a Buffer object.
485+
///
486+
/// The builder is reset and can be reused afterwards.
487+
///
488+
/// \param[out] out the finalized Buffer object
489+
/// \param shrink_to_fit if the buffer size is smaller than its capacity,
490+
/// reallocate to fit more tightly in memory. Set to false to avoid
491+
/// a reallocation, at the expense of potentially more memory consumption.
492+
/// \return Status
451493
Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
452494
ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit));
453495
*out = buffer_;
@@ -472,6 +514,7 @@ class ARROW_EXPORT BufferBuilder {
472514
int64_t size_;
473515
};
474516

517+
/// \brief A BufferBuilder subclass with convenience methods to append typed data
475518
template <typename T>
476519
class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder {
477520
public:

cpp/src/arrow/builder.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,8 @@ class ARROW_EXPORT ArrayBuilder {
118118
virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
119119

120120
/// \brief Return result of builder as an Array object.
121-
/// Resets the builder except for DictionaryBuilder
121+
///
122+
/// The builder is reset except for DictionaryBuilder.
122123
///
123124
/// \param[out] out the finalized Array object
124125
/// \return Status

cpp/src/arrow/memory_pool.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ class ARROW_EXPORT ProxyMemoryPool : public MemoryPool {
142142
std::unique_ptr<ProxyMemoryPoolImpl> impl_;
143143
};
144144

145+
/// Return the process-wide default memory pool.
145146
ARROW_EXPORT MemoryPool* default_memory_pool();
146147

147148
#ifdef ARROW_NO_DEFAULT_MEMORY_POOL

cpp/src/arrow/table.h

Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,12 @@ class ARROW_EXPORT ChunkedArray {
8585

8686
std::shared_ptr<DataType> type() const { return type_; }
8787

88+
/// \brief Determine if two chunked arrays are equal.
89+
///
90+
/// Two chunked arrays can be equal only if they have equal datatypes.
91+
/// However, they may be equal even if they have different chunkings.
8892
bool Equals(const ChunkedArray& other) const;
93+
/// \brief Determine if two chunked arrays are equal.
8994
bool Equals(const std::shared_ptr<ChunkedArray>& other) const;
9095

9196
protected:
@@ -103,13 +108,26 @@ class ARROW_EXPORT ChunkedArray {
103108
/// metadata) and a chunked data array
104109
class ARROW_EXPORT Column {
105110
public:
111+
/// \brief Construct a column from a vector of arrays
112+
///
113+
/// The array chunks' datatype must match the field's datatype.
106114
Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks);
115+
/// \brief Construct a column from a chunked array
116+
///
117+
/// The chunked array's datatype must match the field's datatype.
107118
Column(const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data);
108-
119+
/// \brief Construct a column from a single array
120+
///
121+
/// The array's datatype must match the field's datatype.
109122
Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data);
110123

111-
// Construct from name and array
124+
/// \brief Construct a column from a name and an array
125+
///
126+
/// A field with the given name and the array's datatype is automatically created.
112127
Column(const std::string& name, const std::shared_ptr<Array>& data);
128+
/// \brief Construct a column from a name and a chunked array
129+
///
130+
/// A field with the given name and the chunked array's datatype is automatically created.
113131
Column(const std::string& name, const std::shared_ptr<ChunkedArray>& data);
114132

115133
int64_t length() const { return data_->length(); }
@@ -154,7 +172,12 @@ class ARROW_EXPORT Column {
154172
/// \param[out] out The resulting vector of arrays
155173
Status Flatten(MemoryPool* pool, std::vector<std::shared_ptr<Column>>* out) const;
156174

175+
/// \brief Determine if two columns are equal.
176+
///
177+
/// Two columns can be equal only if they have equal datatypes.
178+
/// However, they may be equal even if they have different chunkings.
157179
bool Equals(const Column& other) const;
180+
/// \brief Determine if two columns are equal.
158181
bool Equals(const std::shared_ptr<Column>& other) const;
159182

160183
/// \brief Verify that the column's array data is consistent with the passed
@@ -214,11 +237,10 @@ class ARROW_EXPORT Table {
214237
const std::vector<std::shared_ptr<RecordBatch>>& batches,
215238
std::shared_ptr<Table>* table);
216239

217-
/// \return the table's schema
240+
/// Return the table schema
218241
std::shared_ptr<Schema> schema() const { return schema_; }
219242

220-
/// \param[in] i column index, does not boundscheck
221-
/// \return the i-th column
243+
/// Return a column by index
222244
virtual std::shared_ptr<Column> column(int i) const = 0;
223245

224246
/// \brief Remove column from the table, producing a new Table
@@ -250,13 +272,16 @@ class ARROW_EXPORT Table {
250272
/// \brief Perform any checks to validate the input arguments
251273
virtual Status Validate() const = 0;
252274

253-
/// \return the number of columns in the table
275+
/// \brief Return the number of columns in the table
254276
int num_columns() const { return schema_->num_fields(); }
255277

256-
/// \return the number of rows (the corresponding length of each column)
278+
/// \brief Return the number of rows (equal to each column's logical length)
257279
int64_t num_rows() const { return num_rows_; }
258280

259-
/// \brief Determine if semantic contents of tables are exactly equal
281+
/// \brief Determine if tables are equal
282+
///
283+
/// Two tables can be equal only if they have equal schemas.
284+
/// However, they may be equal even if they have different chunkings.
260285
bool Equals(const Table& other) const;
261286

262287
protected:
@@ -269,18 +294,25 @@ class ARROW_EXPORT Table {
269294
ARROW_DISALLOW_COPY_AND_ASSIGN(Table);
270295
};
271296

272-
/// \brief Compute a sequence of record batches from a (possibly chunked) Table
297+
/// \brief Compute a stream of record batches from a (possibly chunked) Table
298+
///
299+
/// The conversion is zero-copy: each record batch is a view over a slice
300+
/// of the table's columns.
273301
class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
274302
public:
275303
~TableBatchReader() override;
276304

277-
/// \brief Read batches with the maximum possible size
305+
/// \brief Construct a TableBatchReader for the given table
278306
explicit TableBatchReader(const Table& table);
279307

280308
std::shared_ptr<Schema> schema() const override;
281309

282310
Status ReadNext(std::shared_ptr<RecordBatch>* out) override;
283311

312+
/// \brief Set the desired maximum chunk size of record batches
313+
///
314+
/// The actual chunk size of each record batch may be smaller, depending
315+
/// on actual chunking characteristics of each table column.
284316
void set_chunksize(int64_t chunksize);
285317

286318
private:
@@ -289,7 +321,10 @@ class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
289321
};
290322

291323
/// \brief Construct table from multiple input tables.
292-
/// \return Status, fails if any schemas are different
324+
///
325+
/// The tables are concatenated vertically. Therefore, all tables should
326+
/// have the same schema. Each column in the output table is the result
327+
/// of concatenating the corresponding columns in all input tables.
293328
ARROW_EXPORT
294329
Status ConcatenateTables(const std::vector<std::shared_ptr<Table>>& tables,
295330
std::shared_ptr<Table>* table);

0 commit comments

Comments
 (0)