Skip to content

Commit 593a08d

Browse files
romainfrancoisbkietz
authored and committed
ARROW-9140: [R] Zero-copy Arrow to R where possible
This makes altrep R vectors of type `INTSXP` or `REALSXP` from `arrow::Array` of type `Int32Type` / `DoubleType` that don't have any nulls: the altrep vector holds an external pointer so that the `Array` stays around, and its payload is shared. The R vector is marked as not mutable. ``` r library(arrow, warn.conflicts = FALSE) #> See arrow_info() for available features ``` create a “big” arrow Array with no nulls (just for testing purposes) ``` r a <- arrow:::Test_array_nonull_dbl_vector(1e7) ``` turn into R vector, using altrep, sharing the payload ``` r v <- a$as_vector() ``` verify it’s an altrep with the inspect method ``` r .Internal(inspect(v)) #> @7f9abf8ba470 14 REALSXP g0c0 [REF(65535)] std::shared_ptr<arrow::Array, double, NONULL> (len=10000000, ptr=0x7f9ab5c8cd18) ``` it’s marked as not mutable so check that modify -\> duplicate ``` r v[1] <- 0 #> Duplicate .Internal(inspect(v)) #> @7f9ac0000000 14 REALSXP g1c7 [MARK,REF(1)] (len=10000000, tl=0) 0,42,42,42,42,... ``` timings for double vector ``` r bench::workout({ a <- arrow:::Test_array_nonull_dbl_vector(1e7) v <- a$as_vector() .Internal(inspect(v)) v[1] <- 0 .Internal(inspect(v)) }) #> @7f9abc122190 14 REALSXP g0c0 [REF(65535)] std::shared_ptr<arrow::Array, double, NONULL> (len=10000000, ptr=0x7f9aba2109c8) #> Duplicate #> @7f9aa5c00000 14 REALSXP g1c7 [MARK,REF(1)] (len=10000000, tl=0) 0,42,42,42,42,... 
#> # A tibble: 5 x 3 #> exprs process real #> <bch:expr> <bch:tm> <bch:tm> #> 1 a <- arrow:::Test_array_nonull_dbl_vector(1e+07) 70.3ms 70.6ms #> 2 v <- a$as_vector() 13µs 14.3µs #> 3 .Internal(inspect(v)) 12µs 11.9µs ``` when a copy is needed, the data is copied entirely: ```r #> 4 v[1] <- 0 53.1ms 53.2ms #> 5 .Internal(inspect(v)) 20µs 22.6µs ``` timings for integer vector ``` r bench::workout({ a <- arrow:::Test_array_nonull_int_vector(1e7) v <- a$as_vector() .Internal(inspect(v)) v[1] <- 0 .Internal(inspect(v)) }) #> @7f9abc5bd780 13 INTSXP g0c0 [REF(65535)] std::shared_ptr<arrow::Array, int32, NONULL> (len=10000000, ptr=0x7f9ab8997378) #> @7f9ac0000000 14 REALSXP g1c7 [MARK,REF(1)] (len=10000000, tl=0) 0,42,42,42,42,... #> # A tibble: 5 x 3 #> exprs process real #> <bch:expr> <bch:tm> <bch:tm> #> 1 a <- arrow:::Test_array_nonull_int_vector(1e+07) 54.5ms 54.7ms #> 2 v <- a$as_vector() 12µs 13.2µs #> 3 .Internal(inspect(v)) 11µs 11.3µs #> 4 v[1] <- 0 851.4ms 854.6ms #> 5 .Internal(inspect(v)) 17µs 18.8µs ``` <sup>Created on 2021-06-08 by the [reprex package](https://reprex.tidyverse.org) (v2.0.0)</sup> Closes apache#10445 from romainfrancois/ARROW_9140_zero_copy Lead-authored-by: Romain Francois <romain@rstudio.com> Co-authored-by: Romain François <romain@rstudio.com> Co-authored-by: Benjamin Kietzman <bengilgit@gmail.com> Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
1 parent 8827978 commit 593a08d

7 files changed

Lines changed: 341 additions & 0 deletions

File tree

r/R/arrowExports.R

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/data-raw/codegen.R

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,11 @@ glue::glue('\n
214214
'extern "C" void R_init_arrow(DllInfo* dll){
215215
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
216216
R_useDynamicSymbols(dll, FALSE);
217+
218+
#if defined(HAS_ALTREP)
219+
arrow::r::Init_Altrep_classes(dll);
220+
#endif
221+
217222
}
218223
\n')
219224

r/src/altrep.cpp

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <cpp11/altrep.hpp>
19+
20+
#include "./arrow_types.h"
21+
22+
#if defined(HAS_ALTREP)
23+
24+
#include <R_ext/Altrep.h>
25+
#include <arrow/array.h>
26+
27+
namespace arrow {
28+
namespace r {
29+
30+
template <int sexp_type>
31+
struct ArrayNoNull {
32+
using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
33+
static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
34+
using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
35+
36+
// altrep object around an Array with no nulls
37+
// data1: an external pointer to a shared pointer to the Array
38+
// data2: not used
39+
40+
static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
41+
// we don't need the whole r6 object, just an external pointer
42+
// that retain the array
43+
Pointer xp(new std::shared_ptr<Array>(array));
44+
45+
SEXP res = R_new_altrep(class_t, xp, R_NilValue);
46+
MARK_NOT_MUTABLE(res);
47+
48+
return res;
49+
}
50+
51+
static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
52+
void (*inspect_subtree)(SEXP, int, int, int)) {
53+
const auto& array = Get(x);
54+
Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
55+
array->type()->ToString().c_str(), array->length(), array.get());
56+
inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
57+
return TRUE;
58+
}
59+
60+
static const std::shared_ptr<Array>& Get(SEXP vec) {
61+
return *Pointer(R_altrep_data1(vec));
62+
}
63+
64+
static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
65+
66+
static const void* Dataptr_or_null(SEXP vec) {
67+
return Get(vec)->data()->template GetValues<data_type>(1);
68+
}
69+
70+
static SEXP Duplicate(SEXP vec, Rboolean) {
71+
const auto& array = Get(vec);
72+
auto size = array->length();
73+
74+
SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
75+
76+
memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
77+
78+
UNPROTECT(1);
79+
return copy;
80+
}
81+
82+
static void* Dataptr(SEXP vec, Rboolean writeable) {
83+
return const_cast<void*>(Dataptr_or_null(vec));
84+
}
85+
86+
// by definition, there are no NA
87+
static int No_NA(SEXP vec) { return 1; }
88+
89+
static void Init(R_altrep_class_t class_t, DllInfo* dll) {
90+
// altrep
91+
R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
92+
R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
93+
R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
94+
95+
// altvec
96+
R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
97+
R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
98+
}
99+
};
100+
101+
struct DoubleArrayNoNull {
102+
static R_altrep_class_t class_t;
103+
104+
static void Init(DllInfo* dll) {
105+
class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
106+
ArrayNoNull<REALSXP>::Init(class_t, dll);
107+
R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
108+
}
109+
110+
static SEXP Make(const std::shared_ptr<Array>& array) {
111+
return ArrayNoNull<REALSXP>::Make(class_t, array);
112+
}
113+
};
114+
115+
struct Int32ArrayNoNull {
116+
static R_altrep_class_t class_t;
117+
118+
static void Init(DllInfo* dll) {
119+
class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
120+
ArrayNoNull<INTSXP>::Init(class_t, dll);
121+
R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
122+
}
123+
124+
static SEXP Make(const std::shared_ptr<Array>& array) {
125+
return ArrayNoNull<INTSXP>::Make(class_t, array);
126+
}
127+
};
128+
129+
// Storage for the static class handles declared in the structs above.
// They are assigned when Init_Altrep_classes() runs.
R_altrep_class_t DoubleArrayNoNull::class_t;
R_altrep_class_t Int32ArrayNoNull::class_t;
131+
132+
// Registers every ALTREP class defined in this file with `dll`: the double
// class first, then the integer class.
void Init_Altrep_classes(DllInfo* dll) {
  using InitFn = void (*)(DllInfo*);
  const InitFn initializers[] = {&DoubleArrayNoNull::Init, &Int32ArrayNoNull::Init};

  for (InitFn init : initializers) {
    init(dll);
  }
}
136+
137+
// Returns an altrep double vector that shares the payload of `array`.
SEXP MakeDoubleArrayNoNull(const std::shared_ptr<Array>& array) {
  const R_altrep_class_t dbl_class = DoubleArrayNoNull::class_t;
  return ArrayNoNull<REALSXP>::Make(dbl_class, array);
}
140+
141+
// Returns an altrep integer vector that shares the payload of `array`.
SEXP MakeInt32ArrayNoNull(const std::shared_ptr<Array>& array) {
  const R_altrep_class_t int_class = Int32ArrayNoNull::class_t;
  return ArrayNoNull<INTSXP>::Make(int_class, array);
}
144+
145+
} // namespace r
146+
} // namespace arrow
147+
148+
#endif
149+
150+
// [[arrow::export]]
151+
bool is_altrep_int_nonull(SEXP x) {
  // Reports whether `x` is an altrep vector of the arrow integer no-null
  // class. Always false when the build has no ALTREP support.
  bool inherits = false;
#if defined(HAS_ALTREP)
  inherits = R_altrep_inherits(x, arrow::r::Int32ArrayNoNull::class_t);
#endif
  return inherits;
}
158+
159+
// [[arrow::export]]
160+
bool is_altrep_dbl_nonull(SEXP x) {
  // Reports whether `x` is an altrep vector of the arrow double no-null
  // class. Always false when the build has no ALTREP support.
  bool inherits = false;
#if defined(HAS_ALTREP)
  inherits = R_altrep_inherits(x, arrow::r::DoubleArrayNoNull::class_t);
#endif
  return inherits;
}

r/src/array_to_vector.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <arrow/util/parallel.h>
2929
#include <arrow/util/task_group.h>
3030

31+
#include <cpp11/altrep.hpp>
3132
#include <type_traits>
3233

3334
namespace arrow {
@@ -143,6 +144,24 @@ Status IngestSome(const std::shared_ptr<arrow::Array>& array, R_xlen_t n,
143144
// Allocate + Ingest
144145
SEXP ArrayVector__as_vector(R_xlen_t n, const std::shared_ptr<DataType>& type,
145146
const ArrayVector& arrays) {
147+
#if defined(HAS_ALTREP)
148+
// special case when there is only one array
149+
if (arrays.size() == 1) {
150+
const auto& array = arrays[0];
151+
if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0 &&
152+
array->null_count() == 0) {
153+
switch (type->id()) {
154+
case arrow::Type::DOUBLE:
155+
return arrow::r::MakeDoubleArrayNoNull(array);
156+
case arrow::Type::INT32:
157+
return arrow::r::MakeInt32ArrayNoNull(array);
158+
default:
159+
break;
160+
}
161+
}
162+
}
163+
#endif
164+
146165
auto converter = Converter::Make(type, arrays);
147166
SEXP data = PROTECT(converter->Allocate(n));
148167
StopIfNotOk(converter->IngestSerial(data));
@@ -1280,6 +1299,10 @@ SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array) {
12801299

12811300
// [[arrow::export]]
12821301
SEXP ChunkedArray__as_vector(const std::shared_ptr<arrow::ChunkedArray>& chunked_array) {
  // A chunked array holding exactly one chunk is converted through the
  // single-Array path; any other chunk count goes through the generic
  // multi-array conversion.
  const bool single_chunk = chunked_array->num_chunks() == 1;

  return single_chunk
             ? Array__as_vector(chunked_array->chunk(0))
             : arrow::r::ArrayVector__as_vector(chunked_array->length(),
                                                chunked_array->type(),
                                                chunked_array->chunks());
}

r/src/arrowExports.cpp

Lines changed: 37 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/src/arrow_types.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,12 @@ arrow::Status InferSchemaFromDots(SEXP lst, SEXP schema_sxp, int num_fields,
165165
arrow::Status AddMetadataFromDots(SEXP lst, int num_fields,
166166
std::shared_ptr<arrow::Schema>& schema);
167167

168+
#if defined(HAS_ALTREP)
169+
void Init_Altrep_classes(DllInfo* dll);
170+
SEXP MakeInt32ArrayNoNull(const std::shared_ptr<Array>& array);
171+
SEXP MakeDoubleArrayNoNull(const std::shared_ptr<Array>& array);
172+
#endif
173+
168174
} // namespace r
169175
} // namespace arrow
170176

0 commit comments

Comments
 (0)