Skip to content

Commit 97879eb

Browse files
committed
ARROW-9761: [C/C++] Add experimental C stream interface
The goal is to have a standardized ABI to communicate streams of homogeneous arrays or record batches (for example for database result sets). The trickiest part is error reporting. This proposal tries to strike a compromise between simplicity (an integer error code mapping to errno values) and expressivity (an optional description string for application-specific and context-specific details). Closes apache#8052 from pitrou/ARROW-9761-c-array-stream Authored-by: Antoine Pitrou <antoine@python.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 991a55f commit 97879eb

24 files changed

Lines changed: 1229 additions & 60 deletions

cpp/src/arrow/c/abi.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,44 @@ struct ArrowArray {
6060
void* private_data;
6161
};
6262

63+
// EXPERIMENTAL: C stream interface
64+
65+
struct ArrowArrayStream {
  // get_schema: retrieve the type shared by every array in the stream.
  //
  // Returns 0 on success, otherwise an `errno`-compatible error code.
  //
  // On success, the resulting ArrowSchema has its own lifetime and must be
  // released separately from the stream.
  int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);

  // get_next: retrieve the next array of the stream.
  // End of stream is signalled by a successful call whose output array
  // is released.
  //
  // Returns 0 on success, otherwise an `errno`-compatible error code.
  //
  // On success, the resulting ArrowArray has its own lifetime and must be
  // released separately from the stream.
  int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);

  // get_last_error: retrieve optional detailed error information.
  // Only call this when the last operation on the stream failed,
  // i.e. returned a non-0 code.
  //
  // Returns a pointer to a null-terminated character array describing the
  // last error, or NULL when no description is available.
  //
  // The pointer stays valid only until the next operation on this stream
  // (release included).
  const char* (*get_last_error)(struct ArrowArrayStream*);

  // release: free the resources owned by the stream itself.
  // Arrays obtained through `get_next` are not covered and must each be
  // released individually.
  void (*release)(struct ArrowArrayStream*);

  // Opaque data belonging to the producer.
  void* private_data;
};
100+
63101
#ifdef __cplusplus
64102
}
65103
#endif

cpp/src/arrow/c/bridge.cc

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "arrow/c/bridge.h"
1919

2020
#include <algorithm>
21+
#include <cerrno>
2122
#include <cstring>
2223
#include <string>
2324
#include <utility>
@@ -1501,4 +1502,197 @@ Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
15011502
return ImportRecordBatch(array, *maybe_schema);
15021503
}
15031504

1505+
//////////////////////////////////////////////////////////////////////////
1506+
// C stream export
1507+
1508+
namespace {
1509+
1510+
class ExportedArrayStream {
1511+
public:
1512+
struct PrivateData {
1513+
explicit PrivateData(std::shared_ptr<RecordBatchReader> reader)
1514+
: reader_(std::move(reader)) {}
1515+
1516+
std::shared_ptr<RecordBatchReader> reader_;
1517+
std::string last_error_;
1518+
1519+
PrivateData() = default;
1520+
ARROW_DISALLOW_COPY_AND_ASSIGN(PrivateData);
1521+
};
1522+
1523+
explicit ExportedArrayStream(struct ArrowArrayStream* stream) : stream_(stream) {}
1524+
1525+
Status GetSchema(struct ArrowSchema* out_schema) {
1526+
return ExportSchema(*reader()->schema(), out_schema);
1527+
}
1528+
1529+
Status GetNext(struct ArrowArray* out_array) {
1530+
std::shared_ptr<RecordBatch> batch;
1531+
RETURN_NOT_OK(reader()->ReadNext(&batch));
1532+
if (batch == nullptr) {
1533+
// End of stream
1534+
ArrowArrayMarkReleased(out_array);
1535+
return Status::OK();
1536+
} else {
1537+
return ExportRecordBatch(*batch, out_array);
1538+
}
1539+
}
1540+
1541+
const char* GetLastError() {
1542+
const auto& last_error = private_data()->last_error_;
1543+
return last_error.empty() ? nullptr : last_error.c_str();
1544+
}
1545+
1546+
void Release() {
1547+
if (ArrowArrayStreamIsReleased(stream_)) {
1548+
return;
1549+
}
1550+
DCHECK_NE(private_data(), nullptr);
1551+
delete private_data();
1552+
1553+
ArrowArrayStreamMarkReleased(stream_);
1554+
}
1555+
1556+
// C-compatible callbacks
1557+
1558+
static int StaticGetSchema(struct ArrowArrayStream* stream,
1559+
struct ArrowSchema* out_schema) {
1560+
ExportedArrayStream self{stream};
1561+
return self.ToCError(self.GetSchema(out_schema));
1562+
}
1563+
1564+
static int StaticGetNext(struct ArrowArrayStream* stream,
1565+
struct ArrowArray* out_array) {
1566+
ExportedArrayStream self{stream};
1567+
return self.ToCError(self.GetNext(out_array));
1568+
}
1569+
1570+
static void StaticRelease(struct ArrowArrayStream* stream) {
1571+
ExportedArrayStream{stream}.Release();
1572+
}
1573+
1574+
static const char* StaticGetLastError(struct ArrowArrayStream* stream) {
1575+
return ExportedArrayStream{stream}.GetLastError();
1576+
}
1577+
1578+
private:
1579+
int ToCError(const Status& status) {
1580+
if (ARROW_PREDICT_TRUE(status.ok())) {
1581+
private_data()->last_error_.clear();
1582+
return 0;
1583+
}
1584+
private_data()->last_error_ = status.ToString();
1585+
switch (status.code()) {
1586+
case StatusCode::IOError:
1587+
return EIO;
1588+
case StatusCode::NotImplemented:
1589+
return ENOSYS;
1590+
case StatusCode::OutOfMemory:
1591+
return ENOMEM;
1592+
default:
1593+
return EINVAL; // Fallback for Invalid, TypeError, etc.
1594+
}
1595+
}
1596+
1597+
PrivateData* private_data() {
1598+
return reinterpret_cast<PrivateData*>(stream_->private_data);
1599+
}
1600+
1601+
const std::shared_ptr<RecordBatchReader>& reader() { return private_data()->reader_; }
1602+
1603+
struct ArrowArrayStream* stream_;
1604+
};
1605+
1606+
} // namespace
1607+
1608+
// Fill `out` with the ExportedArrayStream callbacks and a freshly allocated
// PrivateData that holds `reader`. Always returns Status::OK().
Status ExportRecordBatchReader(std::shared_ptr<RecordBatchReader> reader,
                               struct ArrowArrayStream* out) {
  using Exported = ExportedArrayStream;

  out->get_schema = &Exported::StaticGetSchema;
  out->get_next = &Exported::StaticGetNext;
  out->get_last_error = &Exported::StaticGetLastError;
  out->release = &Exported::StaticRelease;
  // Deleted by ExportedArrayStream::Release().
  out->private_data = new Exported::PrivateData{std::move(reader)};
  return Status::OK();
}
1617+
1618+
//////////////////////////////////////////////////////////////////////////
1619+
// C stream import
1620+
1621+
namespace {
1622+
1623+
class ArrayStreamBatchReader : public RecordBatchReader {
1624+
public:
1625+
explicit ArrayStreamBatchReader(struct ArrowArrayStream* stream) {
1626+
ArrowArrayStreamMove(stream, &stream_);
1627+
DCHECK(!ArrowArrayStreamIsReleased(&stream_));
1628+
}
1629+
1630+
~ArrayStreamBatchReader() {
1631+
ArrowArrayStreamRelease(&stream_);
1632+
DCHECK(ArrowArrayStreamIsReleased(&stream_));
1633+
}
1634+
1635+
std::shared_ptr<Schema> schema() const override { return CacheSchema(); }
1636+
1637+
Status ReadNext(std::shared_ptr<RecordBatch>* batch) override {
1638+
struct ArrowArray c_array;
1639+
RETURN_NOT_OK(StatusFromCError(stream_.get_next(&stream_, &c_array)));
1640+
if (ArrowArrayIsReleased(&c_array)) {
1641+
// End of stream
1642+
batch->reset();
1643+
return Status::OK();
1644+
} else {
1645+
return ImportRecordBatch(&c_array, CacheSchema()).Value(batch);
1646+
}
1647+
}
1648+
1649+
private:
1650+
std::shared_ptr<Schema> CacheSchema() const {
1651+
if (!schema_) {
1652+
struct ArrowSchema c_schema;
1653+
ARROW_CHECK_OK(StatusFromCError(stream_.get_schema(&stream_, &c_schema)));
1654+
schema_ = ImportSchema(&c_schema).ValueOrDie();
1655+
}
1656+
return schema_;
1657+
}
1658+
1659+
Status StatusFromCError(int errno_like) const {
1660+
if (ARROW_PREDICT_TRUE(errno_like == 0)) {
1661+
return Status::OK();
1662+
}
1663+
StatusCode code;
1664+
switch (errno_like) {
1665+
case EDOM:
1666+
case EINVAL:
1667+
case ERANGE:
1668+
code = StatusCode::Invalid;
1669+
break;
1670+
case ENOMEM:
1671+
code = StatusCode::OutOfMemory;
1672+
break;
1673+
case ENOSYS:
1674+
code = StatusCode::NotImplemented;
1675+
default:
1676+
code = StatusCode::IOError;
1677+
break;
1678+
}
1679+
const char* last_error = stream_.get_last_error(&stream_);
1680+
return Status(code, last_error ? std::string(last_error) : "");
1681+
}
1682+
1683+
mutable struct ArrowArrayStream stream_;
1684+
mutable std::shared_ptr<Schema> schema_;
1685+
};
1686+
1687+
} // namespace
1688+
1689+
// Wrap `stream` in an ArrayStreamBatchReader.
// Returns Status::Invalid when `stream` is already released.
Result<std::shared_ptr<RecordBatchReader>> ImportRecordBatchReader(
    struct ArrowArrayStream* stream) {
  if (!ArrowArrayStreamIsReleased(stream)) {
    // XXX should we call get_schema() here to avoid crashing on error?
    return std::make_shared<ArrayStreamBatchReader>(stream);
  }
  return Status::Invalid("Cannot import released ArrowArrayStream");
}
1697+
15041698
} // namespace arrow

cpp/src/arrow/c/bridge.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@
2929

3030
namespace arrow {
3131

32+
/// \defgroup c-data-interface Functions for working with the C data interface.
33+
///
34+
/// @{
35+
3236
/// \brief Export C++ DataType using the C data interface format.
3337
///
3438
/// The root type is considered to have empty name and metadata.
@@ -160,4 +164,34 @@ ARROW_EXPORT
160164
Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
161165
struct ArrowSchema* schema);
162166

167+
/// @}
168+
169+
/// \defgroup c-stream-interface Functions for working with the C stream interface.
170+
///
171+
/// @{
172+
173+
/// \brief EXPERIMENTAL: Export C++ RecordBatchReader using the C stream interface.
174+
///
175+
/// The resulting ArrowArrayStream struct keeps the record batch reader alive
176+
/// until its release callback is called by the consumer.
177+
///
178+
/// \param[in] reader RecordBatchReader object to export
179+
/// \param[out] out C struct where to export the stream
180+
ARROW_EXPORT
181+
Status ExportRecordBatchReader(std::shared_ptr<RecordBatchReader> reader,
182+
struct ArrowArrayStream* out);
183+
184+
/// \brief EXPERIMENTAL: Import C++ RecordBatchReader from the C stream interface.
185+
///
186+
/// The ArrowArrayStream struct has its contents moved to a private object
187+
/// held alive by the resulting record batch reader.
188+
///
189+
/// \param[in,out] stream C stream interface struct
190+
/// \return Imported RecordBatchReader object
191+
ARROW_EXPORT
192+
Result<std::shared_ptr<RecordBatchReader>> ImportRecordBatchReader(
193+
struct ArrowArrayStream* stream);
194+
195+
/// @}
196+
163197
} // namespace arrow

0 commit comments

Comments
 (0)