forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaltrep.cpp
More file actions
1255 lines (1015 loc) · 41.3 KB
/
Copy pathaltrep.cpp
File metadata and controls
1255 lines (1015 loc) · 41.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "./arrow_types.h"
#include <arrow/array.h>
#include <arrow/chunk_resolver.h>
#include <arrow/chunked_array.h>
#include <arrow/compute/api.h>
#include <arrow/util/bitmap_reader.h>
#include <arrow/visit_data_inline.h>
#include <cpp11/altrep.hpp>
#include <cpp11/declarations.hpp>
#if defined(HAS_ALTREP)
#if R_VERSION < R_Version(3, 6, 0)
// workaround because R's <R_ext/Altrep.h> not so conveniently uses `class`
// as a variable name, and C++ is not happy about that
//
// SEXP R_new_altrep(R_altrep_class_t class, SEXP data1, SEXP data2);
//
#define class klass
// Because functions declared in <R_ext/Altrep.h> have C linkage
extern "C" {
#include <R_ext/Altrep.h>
}
// undo the workaround
#undef class
#else
#include <R_ext/Altrep.h>
#endif
#include "./r_task_group.h"
// Forward declaration: defined in array_to_vector.cpp.
// Used in this file to turn a dictionary Array into the R vector that is
// stored as the `levels` attribute of an altrep factor.
SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array);
namespace arrow {
namespace r {
namespace altrep {
namespace {
// Copy up to `n` values starting at position `i` of the standard R vector
// `data2` into `buf`, returning what the matching R *_GET_REGION accessor
// returns. Only the `double` and `int` specializations below are defined.
template <typename c_type>
R_xlen_t Standard_Get_region(SEXP data2, R_xlen_t i, R_xlen_t n, c_type* buf);

// REALSXP flavour: forwards to REAL_GET_REGION
template <>
R_xlen_t Standard_Get_region<double>(SEXP data2, R_xlen_t i, R_xlen_t n, double* buf) {
  const R_xlen_t copied = REAL_GET_REGION(data2, i, n, buf);
  return copied;
}

// INTSXP flavour: forwards to INTEGER_GET_REGION
template <>
R_xlen_t Standard_Get_region<int>(SEXP data2, R_xlen_t i, R_xlen_t n, int* buf) {
  const R_xlen_t copied = INTEGER_GET_REGION(data2, i, n, buf);
  return copied;
}
// Destroys a heap-allocated std::shared_ptr<T> handle, dropping the
// reference it holds on the pointee.
template <typename T>
void DeletePointer(std::shared_ptr<T>* handle) {
  delete handle;
}
// cpp11 external pointer whose payload is a heap-allocated
// std::shared_ptr<Held>, with DeletePointer<Held> as its deleter argument.
template <typename Held>
using Pointer = cpp11::external_pointer<std::shared_ptr<Held>, DeletePointer<Held>>;
// Payload stored behind the data1 external pointer of the altrep vectors in
// this file: the wrapped ChunkedArray plus a ChunkResolver built from its
// chunks, used to translate a logical index into a chunk location.
class ArrowAltrepData {
 public:
  explicit ArrowAltrepData(const std::shared_ptr<ChunkedArray>& chunked_array)
      : chunked_array_(chunked_array), resolver_(chunked_array->chunks()) {}

  // The wrapped ChunkedArray
  const std::shared_ptr<ChunkedArray>& chunked_array() { return chunked_array_; }

  // Resolve `index` (a position in the whole ChunkedArray) to the chunk that
  // holds it and the position inside that chunk.
  arrow::internal::ChunkLocation locate(int64_t index) {
    auto location = resolver_.Resolve(index);
    return location;
  }

 private:
  std::shared_ptr<ChunkedArray> chunked_array_;
  arrow::internal::ChunkResolver resolver_;
};
// Returns the ChunkedArray wrapped by the altrep object `alt`: the address
// held by the data1 external pointer is interpreted as an ArrowAltrepData.
const std::shared_ptr<ChunkedArray>& GetChunkedArray(SEXP alt) {
  void* address = R_ExternalPtrAddr(R_altrep_data1(alt));
  return static_cast<ArrowAltrepData*>(address)->chunked_array();
}
// Base class for all altrep vectors, parameterized by the concrete
// implementation `Impl` (static polymorphism: the methods below call back
// into `Impl::`).
//
// data1: an external pointer to an ArrowAltrepData, i.e. the wrapped
//        ChunkedArray. The Materialize() methods of the implementations in
//        this file reset it to NULL once data2 holds the data.
// data2: starts as NULL, and becomes a standard R vector with the same
//        data if necessary: if materialization is needed, e.g. if we need
//        to access its data pointer, with DATAPTR().
//
// `Impl` is expected to provide `class_t` and `Materialize()`. It may also
// shadow `Representation()` / `SetRepresentation()` when data2 needs to be
// richer than the bare materialized vector (e.g. for factors); for that
// reason every access to the representation in this class goes through
// `Impl::Representation()`.
template <typename Impl>
struct AltrepVectorBase {
  // store the Array as an external pointer in data1, mark as immutable
  static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
    SEXP alt = R_new_altrep(
        Impl::class_t,
        external_pointer<ArrowAltrepData>(new ArrowAltrepData(chunked_array)),
        R_NilValue);
    MARK_NOT_MUTABLE(alt);

    return alt;
  }

  // Is the vector materialized, i.e. does the representation slot contain a
  // standard R vector with the same data as the array.
  static bool IsMaterialized(SEXP alt) { return !Rf_isNull(Impl::Representation(alt)); }

  // Length of the vector: taken from the materialized R vector when there is
  // one, from the ChunkedArray otherwise.
  static R_xlen_t Length(SEXP alt) {
    if (IsMaterialized(alt)) {
      return Rf_xlength(Impl::Representation(alt));
    }
    return GetChunkedArray(alt)->length();
  }

  // Non-zero only when the vector is still backed by the ChunkedArray and
  // that array reports no nulls; a materialized vector always answers 0.
  static int No_NA(SEXP alt) {
    if (IsMaterialized(alt)) {
      return false;
    }

    return GetChunkedArray(alt)->null_count() == 0;
  }

  static int Is_sorted(SEXP alt) { return UNKNOWN_SORTEDNESS; }

  // What gets printed on .Internal(inspect(<the altrep object>))
  //
  // Lengths and null counts are 64-bit quantities: they are cast to
  // `long long` and printed with %lld so that the arguments match the format.
  static Rboolean Inspect(SEXP alt, int pre, int deep, int pvec,
                          void (*inspect_subtree)(SEXP, int, int, int)) {
    SEXP data_class_sym = CAR(ATTRIB(ALTREP_CLASS(alt)));
    const char* class_name = CHAR(PRINTNAME(data_class_sym));

    if (IsMaterialized(alt)) {
      Rprintf("materialized %s len=%lld\n", class_name,
              static_cast<long long>(Rf_xlength(Impl::Representation(alt))));
    } else {
      const auto& chunked_array = GetChunkedArray(alt);
      Rprintf("%s<%p, %s, %d chunks, %lld nulls> len=%lld\n", class_name,
              static_cast<void*>(chunked_array.get()),
              chunked_array->type()->ToString().c_str(), chunked_array->num_chunks(),
              static_cast<long long>(chunked_array->null_count()),
              static_cast<long long>(chunked_array->length()));
    }

    return TRUE;
  }

  // Materialize and then duplicate the materialized vector
  static SEXP Duplicate(SEXP alt, Rboolean /* deep */) {
    return Rf_duplicate(Impl::Materialize(alt));
  }

  // Materialize and then coerce the materialized vector to `type`
  static SEXP Coerce(SEXP alt, int type) {
    return Rf_coerceVector(Impl::Materialize(alt), type);
  }

  // The serialized state is the materialized vector itself, so that
  // unserializing simply hands that vector back.
  static SEXP Serialized_state(SEXP alt) { return Impl::Materialize(alt); }

  static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }

  // default methods used when data2 is the representation
  // this is overridden when data2 needs to be richer (e.g. for factors)
  static SEXP Representation(SEXP alt) { return R_altrep_data2(alt); }

  static void SetRepresentation(SEXP alt, SEXP x) { R_set_altrep_data2(alt, x); }
};
// altrep R vector shadowing a primitive (int or double) Array.
//
// This tries as much as possible to directly use the data
// from the Array and minimize data copies.
//
// `sexp_type` is the R vector type being implemented: REALSXP selects
// `double` as the element type, anything else selects `int`.
template <int sexp_type>
struct AltrepVectorPrimitive : public AltrepVectorBase<AltrepVectorPrimitive<sexp_type>> {
  using Base = AltrepVectorBase<AltrepVectorPrimitive<sexp_type>>;

  // singleton altrep class description
  static R_altrep_class_t class_t;

  using c_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;

  using Base::IsMaterialized;
  using Base::Representation;
  using Base::SetRepresentation;

  // Force materialization. After calling this, the data2 slot of the altrep
  // object contains a standard R vector with the same data, with
  // R sentinels where the Array has nulls. This method also releases the
  // reference to the original ChunkedArray.
  static SEXP Materialize(SEXP alt) {
    if (!IsMaterialized(alt)) {
      auto size = Base::Length(alt);

      // create a standard R vector
      SEXP copy = PROTECT(Rf_allocVector(sexp_type, size));

      // copy the data from the array, through Get_region
      Get_region(alt, 0, size, reinterpret_cast<c_type*>(DATAPTR(copy)));

      // store as data2, this is now considered materialized
      SetRepresentation(alt, copy);

      // we no longer need the original ChunkedArray (keeping it alive uses more
      // memory than is required, since our methods can now use the
      // materialized array)
      R_set_altrep_data1(alt, R_NilValue);

      UNPROTECT(1);
    }

    return Representation(alt);
  }

  // R calls this to get a pointer to the start of the vector data
  // but only if this is possible without allocating (in the R sense).
  static const void* Dataptr_or_null(SEXP alt) {
    // data2 has been created, and so the R sentinels are in place where the array has
    // nulls
    if (IsMaterialized(alt)) {
      return DATAPTR_RO(Representation(alt));
    }

    // there is only one chunk with no nulls, we can directly return the start of its data
    // (bound by reference: no need to take an extra shared_ptr reference here)
    const auto& chunked_array = GetChunkedArray(alt);
    if (chunked_array->num_chunks() == 1 && chunked_array->null_count() == 0) {
      return reinterpret_cast<const void*>(
          chunked_array->chunk(0)->data()->template GetValues<c_type>(1));
    }

    // Otherwise: if the array has nulls and data2 has not been generated: give up
    return nullptr;
  }

  // R calls this to get a pointer to the start of the data, R allocations are allowed.
  //
  // NOTE: `writeable` is not consulted. In the single-chunk/no-null case the
  // returned pointer aliases the Array's own value buffer.
  static void* Dataptr(SEXP alt, Rboolean writeable) {
    // If the object hasn't been materialized, and the array has no
    // nulls we can directly point to the array data.
    if (!IsMaterialized(alt)) {
      const auto& chunked_array = GetChunkedArray(alt);

      if (chunked_array->num_chunks() == 1 && chunked_array->null_count() == 0) {
        return reinterpret_cast<void*>(const_cast<c_type*>(
            chunked_array->chunk(0)->data()->template GetValues<c_type>(1)));
      }
    }

    // Otherwise we have to materialize and hand the pointer to data2
    return DATAPTR(Materialize(alt));
  }

  // The value at position i
  static c_type Elt(SEXP alt, R_xlen_t i) {
    if (IsMaterialized(alt)) {
      return reinterpret_cast<c_type*>(DATAPTR(Representation(alt)))[i];
    }

    // locate the chunk holding position i, and the position inside that chunk
    auto altrep_data =
        reinterpret_cast<ArrowAltrepData*>(R_ExternalPtrAddr(R_altrep_data1(alt)));
    auto resolve = altrep_data->locate(i);
    const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index);
    auto j = resolve.index_in_chunk;

    // nulls become the R NA sentinel for this type
    return array->IsNull(j) ? cpp11::na<c_type>()
                            : array->data()->template GetValues<c_type>(1)[j];
  }

  // R calls this when it wants data from position `i` to `i + n` copied into `buf`
  // The returned value is the number of values that were really copied
  // (this can be lower than n)
  static R_xlen_t Get_region(SEXP alt, R_xlen_t i, R_xlen_t n, c_type* buf) {
    // If we have data2, we can just copy the region into buf
    // using the standard Get_region for this R type
    if (IsMaterialized(alt)) {
      return Standard_Get_region<c_type>(Representation(alt), i, n, buf);
    }

    // The vector was not materialized, aka we don't have data2
    //
    // In that case, we copy the data from the Array, and then
    // do a second pass to force the R sentinels for where the
    // array has nulls
    //
    // This only materializes the region into buf (not the entire vector).
    const auto& chunked_array = GetChunkedArray(alt);

    // get out if there is nothing to do: a start at or past the end never
    // reaches Slice() (same guard as the factor implementation of Get_region)
    if (i >= chunked_array->length()) return 0;

    auto slice = chunked_array->Slice(i, n);
    R_xlen_t ncopy = slice->length();

    c_type* out = buf;
    for (const auto& array : slice->chunks()) {
      auto n_i = array->length();

      // nothing to copy from an empty chunk
      if (n_i == 0) continue;

      // first copy the data buffer
      memcpy(out, array->data()->template GetValues<c_type>(1), n_i * sizeof(c_type));

      // then set the R NA sentinels if needed
      if (array->null_count() > 0) {
        internal::BitmapReader bitmap_reader(array->null_bitmap()->data(),
                                             array->offset(), n_i);

        for (R_xlen_t j = 0; j < n_i; j++, bitmap_reader.Next()) {
          if (bitmap_reader.IsNotSet()) {
            out[j] = cpp11::na<c_type>();
          }
        }
      }

      out += n_i;
    }

    return ncopy;
  }

  // Options for the aggregation kernels used below: never require a minimum
  // number of values, and skip nulls only when `na_rm` is true.
  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(bool na_rm) {
    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
        arrow::compute::ScalarAggregateOptions::Defaults());
    options->min_count = 0;
    options->skip_nulls = na_rm;
    return options;
  }

  // Shared implementation of Min() and Max(), selected by the `Min` flag.
  //
  // Returns nullptr once the vector has been materialized (ALTREP convention
  // for "no answer, use the default implementation").
  template <bool Min>
  static SEXP MinMax(SEXP alt, Rboolean narm) {
    if (IsMaterialized(alt)) {
      return nullptr;
    }

    using scalar_type =
        typename std::conditional<sexp_type == INTSXP, Int32Scalar, DoubleScalar>::type;

    const auto& chunked_array = GetChunkedArray(alt);
    bool na_rm = narm == TRUE;
    auto n = chunked_array->length();
    auto null_count = chunked_array->null_count();

    // nothing left to aggregate: empty vector, or only nulls with na.rm = TRUE
    if ((na_rm || n == 0) && null_count == n) {
      if (Min) {
        Rf_warning("no non-missing arguments to min; returning Inf");
        return Rf_ScalarReal(R_PosInf);
      } else {
        Rf_warning("no non-missing arguments to max; returning -Inf");
        return Rf_ScalarReal(R_NegInf);
      }
    }

    // nulls are kept: the result is NA
    if (!na_rm && null_count > 0) {
      return cpp11::as_sexp(cpp11::na<c_type>());
    }

    auto options = NaRmOptions(na_rm);

    const auto& minmax = ValueOrStop(
        arrow::compute::CallFunction("min_max", {chunked_array}, options.get()));
    const auto& minmax_scalar =
        internal::checked_cast<const StructScalar&>(*minmax.scalar());

    // "min_max" yields a struct scalar: pick the field we were asked for
    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
        *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
    return cpp11::as_sexp(result_scalar.value);
  }

  static SEXP Min(SEXP alt, Rboolean narm) { return MinMax<true>(alt, narm); }

  static SEXP Max(SEXP alt, Rboolean narm) { return MinMax<false>(alt, narm); }

  // Sum of the values, computed by the Arrow "sum" kernel while the vector
  // is still backed by the ChunkedArray; nullptr once it is materialized.
  static SEXP Sum(SEXP alt, Rboolean narm) {
    if (IsMaterialized(alt)) {
      return nullptr;
    }

    const auto& chunked_array = GetChunkedArray(alt);
    bool na_rm = narm == TRUE;
    auto null_count = chunked_array->null_count();

    // nulls are kept: the result is NA
    if (!na_rm && null_count > 0) {
      return cpp11::as_sexp(cpp11::na<c_type>());
    }
    auto options = NaRmOptions(na_rm);

    const auto& sum =
        ValueOrStop(arrow::compute::CallFunction("sum", {chunked_array}, options.get()));

    if (sexp_type == INTSXP) {
      // When calling the "sum" function on an int32 array, we get an Int64 scalar
      // in case of overflow, make it a double like R
      // (INT32_MIN itself is excluded from the integer range by the `<=`)
      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
      if (value <= INT32_MIN || value > INT32_MAX) {
        return Rf_ScalarReal(static_cast<double>(value));
      } else {
        return Rf_ScalarInteger(static_cast<int>(value));
      }
    } else {
      return Rf_ScalarReal(
          internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
    }
  }
};

// definition of the per-instantiation altrep class description
template <int sexp_type>
R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
// altrep R factor shadowing a ChunkedArray of dictionary arrays.
//
// data1: external pointer to the ArrowAltrepData (as for the other classes)
// data2: a pairlist of two nodes:
//        - CAR:  the representation (NULL until materialized, then an INTSXP)
//        - CADR: NULL, or an external pointer to the transpose buffers when
//                the dictionaries of the chunks had to be unified
struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
  // singleton altrep class description
  static R_altrep_class_t class_t;

  using Base = AltrepVectorBase<AltrepFactor>;
  using Base::IsMaterialized;

  // Length of the factor: from the materialized integer vector when there is
  // one, from the ChunkedArray otherwise.
  static R_xlen_t Length(SEXP alt) {
    if (IsMaterialized(alt)) {
      return Rf_xlength(Representation(alt));
    } else {
      return GetChunkedArray(alt)->length();
    }
  }

  // redefining because data2 is a paired list with the representation as the
  // first node: the CAR
  static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }

  static void SetRepresentation(SEXP alt, SEXP x) { SETCAR(R_altrep_data2(alt), x); }

  // The CADR(data2) is used to store the transposed arrays when unification is needed
  // In that case we store a vector of Buffers
  using BufferVector = std::vector<std::shared_ptr<Buffer>>;

  static bool WasUnified(SEXP alt) { return !Rf_isNull(CADR(R_altrep_data2(alt))); }

  // The transpose buffer of chunk `i`. Only meaningful when WasUnified(alt).
  static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
    const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
    return (*arrays)[i];
  }

  // Build the altrep factor for `chunked_array`. Returns R_NilValue when the
  // dictionary of the first chunk is not a string array.
  // NOTE(review): chunk(0) is read unconditionally, so this presumably
  // expects at least one chunk - confirm against the callers.
  static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
    // only dealing with dictionaries of strings
    if (internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(0))
            .dictionary()
            ->type_id() != Type::STRING) {
      return R_NilValue;
    }

    bool need_unification = DictionaryChunkArrayNeedUnification(chunked_array);

    std::shared_ptr<Array> dictionary;
    SEXP pointer_arrays_transpose;

    if (need_unification) {
      const auto& arr_type =
          internal::checked_cast<const DictionaryType&>(*chunked_array->type());
      std::unique_ptr<arrow::DictionaryUnifier> unifier_ =
          ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));

      // one transpose buffer per chunk, filled while unifying the dictionaries
      size_t n_arrays = chunked_array->num_chunks();
      BufferVector arrays_transpose(n_arrays);

      for (size_t i = 0; i < n_arrays; i++) {
        const auto& dict_i =
            *internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(i))
                 .dictionary();
        StopIfNotOk(unifier_->Unify(dict_i, &arrays_transpose[i]));
      }

      std::shared_ptr<DataType> out_type;
      StopIfNotOk(unifier_->GetResult(&out_type, &dictionary));

      // hand the buffers over to the external pointer (moved, not copied)
      Pointer<BufferVector> ptr(new std::shared_ptr<BufferVector>(
          std::make_shared<BufferVector>(std::move(arrays_transpose))));
      pointer_arrays_transpose = PROTECT(ptr);
    } else {
      // just use the first one
      const auto& dict_array =
          internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
      dictionary = dict_array.dictionary();

      pointer_arrays_transpose = PROTECT(R_NilValue);
    }

    // the chunked array as data1
    SEXP data1 =
        PROTECT(external_pointer<ArrowAltrepData>(new ArrowAltrepData(chunked_array)));

    // a pairlist with the representation in the first node
    SEXP data2 = PROTECT(Rf_list2(R_NilValue,  // representation, empty at first
                                  pointer_arrays_transpose));

    SEXP alt = PROTECT(R_new_altrep(class_t, data1, data2));
    MARK_NOT_MUTABLE(alt);

    // set factor attributes
    Rf_setAttrib(alt, R_LevelsSymbol, Array__as_vector(dictionary));

    if (internal::checked_cast<const DictionaryType&>(*chunked_array->type()).ordered()) {
      Rf_classgets(alt, arrow::r::data::classes_ordered);
    } else {
      Rf_classgets(alt, arrow::r::data::classes_factor);
    }

    // pointer_arrays_transpose, data1, data2, alt
    UNPROTECT(4);
    return alt;
  }

  // TODO: this is similar to the primitive Materialize
  //
  // Force materialization: the representation becomes an INTSXP holding the
  // factor codes, and the reference to the ChunkedArray is dropped.
  static SEXP Materialize(SEXP alt) {
    if (!IsMaterialized(alt)) {
      auto size = Base::Length(alt);

      // create a standard R vector
      SEXP copy = PROTECT(Rf_allocVector(INTSXP, size));

      // copy the data from the array, through Get_region
      Get_region(alt, 0, size, reinterpret_cast<int*>(DATAPTR(copy)));

      // store as data2, this is now considered materialized
      SetRepresentation(alt, copy);

      // remove the ChunkedArray reference
      R_set_altrep_data1(alt, R_NilValue);

      UNPROTECT(1);
    }

    return Representation(alt);
  }

  // Pointer to the data without allocating: only possible once materialized
  static const void* Dataptr_or_null(SEXP alt) {
    if (IsMaterialized(alt)) {
      return DATAPTR_RO(Representation(alt));
    }

    return nullptr;
  }

  static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); }

  // Materialize, then return a real copy of the integer codes carrying a
  // copy of the attributes of the altrep object.
  //
  // Rf_duplicate() is used (as in the base class) rather than
  // Rf_lazy_duplicate(): the latter hands back its argument, so setting
  // attributes on the result would modify the internal representation and the
  // caller would share storage with this altrep object.
  static SEXP Duplicate(SEXP alt, Rboolean /* deep */) {
    // the representation integer vector
    SEXP dup = PROTECT(Rf_duplicate(Materialize(alt)));

    // additional attributes from the altrep
    SEXP atts = PROTECT(Rf_duplicate(ATTRIB(alt)));
    SET_ATTRIB(dup, atts);

    UNPROTECT(2);
    return dup;
  }

  // Reads the dictionary index stored at position `j` of the integer array
  // `indices`, widened to int64_t. Returns false (leaving `value` untouched)
  // when `indices` is not of an integer type.
  static bool IndexAt(const Array& indices, int64_t j, int64_t* value) {
    const auto& data = *indices.data();
    switch (indices.type_id()) {
      case Type::UINT8:
        *value = data.GetValues<uint8_t>(1)[j];
        return true;
      case Type::INT8:
        *value = data.GetValues<int8_t>(1)[j];
        return true;
      case Type::UINT16:
        *value = data.GetValues<uint16_t>(1)[j];
        return true;
      case Type::INT16:
        *value = data.GetValues<int16_t>(1)[j];
        return true;
      case Type::INT32:
        *value = data.GetValues<int32_t>(1)[j];
        return true;
      case Type::UINT32:
        *value = data.GetValues<uint32_t>(1)[j];
        return true;
      case Type::INT64:
        *value = data.GetValues<int64_t>(1)[j];
        return true;
      case Type::UINT64:
        *value = static_cast<int64_t>(data.GetValues<uint64_t>(1)[j]);
        return true;
      default:
        return false;
    }
  }

  // The value at position i: the 1-based factor code, or NA_INTEGER when the
  // value is null (or the index type is not an integer type).
  static int Elt(SEXP alt, R_xlen_t i) {
    if (Base::IsMaterialized(alt)) {
      return INTEGER_ELT(Representation(alt), i);
    }

    // locate the chunk holding position i, and the position inside that chunk
    auto altrep_data =
        reinterpret_cast<ArrowAltrepData*>(R_ExternalPtrAddr(R_altrep_data1(alt)));
    auto resolve = altrep_data->locate(i);

    const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index);
    auto j = resolve.index_in_chunk;

    if (array->IsNull(j)) {
      return NA_INTEGER;
    }

    const auto& indices =
        internal::checked_cast<const DictionaryArray&>(*array).indices();

    int64_t index = 0;
    if (!IndexAt(*indices, j, &index)) {
      return NA_INTEGER;
    }

    if (WasUnified(alt)) {
      // map the chunk-local index to the index in the unified dictionary
      const auto* transpose_data = reinterpret_cast<const int32_t*>(
          GetArrayTransposed(alt, resolve.chunk_index)->data());
      return transpose_data[index] + 1;
    }

    // R factor codes are 1-based
    return static_cast<int>(index + 1);
  }

  // Copy the factor codes of positions `start` to `start + n` into `buf`,
  // returning how many were really copied (this can be lower than n).
  static R_xlen_t Get_region(SEXP alt, R_xlen_t start, R_xlen_t n, int* buf) {
    // If we have data2, we can just copy the region into buf
    // using the standard Get_region for this R type
    if (Base::IsMaterialized(alt)) {
      return Standard_Get_region<int>(Representation(alt), start, n, buf);
    }

    const auto& chunked_array = GetChunkedArray(alt);

    // get out if there is nothing to do
    auto chunked_array_size = chunked_array->length();
    if (start >= chunked_array_size) return 0;

    auto slice = chunked_array->Slice(start, n);

    if (WasUnified(alt)) {
      int j = 0;

      // find out which is the first chunk of the chunk array
      // that is present in the slice, because the main loop
      // needs to refer to the correct transpose buffers
      int64_t k = 0;
      for (; j < chunked_array->num_chunks(); j++) {
        auto nj = chunked_array->chunk(j)->length();
        if (k + nj > start) {
          break;
        }

        k += nj;
      }

      int* out = buf;
      for (const auto& array : slice->chunks()) {
        const auto& indices =
            internal::checked_cast<const DictionaryArray&>(*array).indices();

        // using the transpose data for this chunk
        const auto* transpose_data =
            reinterpret_cast<const int32_t*>(GetArrayTransposed(alt, j)->data());
        auto transpose = [transpose_data](int x) { return transpose_data[x]; };

        GetRegionDispatch(array, indices, transpose, out);

        out += array->length();
        j++;
      }

    } else {
      // simpler case, identity transpose
      auto transpose = [](int x) { return x; };

      int* out = buf;
      for (const auto& array : slice->chunks()) {
        const auto& indices =
            internal::checked_cast<const DictionaryArray&>(*array).indices();

        GetRegionDispatch(array, indices, transpose, out);

        out += array->length();
      }
    }

    return slice->length();
  }

// one `case` of GetRegionDispatch(): forwards to GetRegionTranspose<> for the
// integer type named TYPE_CLASS
#define CALL_GET_REGION_TRANSPOSE(TYPE_CLASS)                                       \
  case TYPE_CLASS##Type::type_id:                                                   \
    GetRegionTranspose<TYPE_CLASS##Type>(array, indices,                            \
                                         std::forward<Transpose>(transpose), out); \
    break

  // Dispatch on the integer type of `indices`; other types are ignored.
  template <typename Transpose>
  static void GetRegionDispatch(const std::shared_ptr<Array>& array,
                                const std::shared_ptr<Array>& indices,
                                Transpose&& transpose, int* out) {
    switch (indices->type_id()) {
      ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_GET_REGION_TRANSPOSE);
      default:
        break;
    }
  }

  // Write the factor codes of `array` to `out`: `transpose(index) + 1` for
  // valid values, the integer NA for nulls. `Type` is the index type of the
  // dictionary array.
  template <typename Type, typename Transpose>
  static void GetRegionTranspose(const std::shared_ptr<Array>& array,
                                 const std::shared_ptr<Array>& indices,
                                 Transpose transpose, int* out) {
    using index_type = typename Type::c_type;

    VisitArraySpanInline<Type>(
        *array->data(),
        /*valid_func=*/[&](index_type index) { *out++ = transpose(index) + 1; },
        /*null_func=*/[&]() { *out++ = cpp11::na<int>(); });
  }

  // No specialized summaries for factors: nullptr is returned unconditionally
  static SEXP Min(SEXP alt, Rboolean narm) { return nullptr; }

  static SEXP Max(SEXP alt, Rboolean narm) { return nullptr; }

  static SEXP Sum(SEXP alt, Rboolean narm) { return nullptr; }
};

// definition of the altrep class description
R_altrep_class_t AltrepFactor::class_t;
// Implementation for string arrays
//
// `Type` is the Arrow string type; the matching array class is obtained
// through TypeTraits<Type>::ArrayType.
template <typename Type>
struct AltrepVectorString : public AltrepVectorBase<AltrepVectorString<Type>> {
  using Base = AltrepVectorBase<AltrepVectorString<Type>>;

  static R_altrep_class_t class_t;
  using StringArrayType = typename TypeTraits<Type>::ArrayType;

  using Base::Representation;
  using Base::SetRepresentation;

  // Same as Base::Make(), but first records the current value of the
  // "arrow.skip_nul" option in the shared string viewer.
  static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
    string_viewer().set_strip_out_nuls(GetBoolOption("arrow.skip_nul", false));
    return Base::Make(chunked_array);
  }

  // Helper class to convert to R strings. We declare one of these for the
  // class to avoid having to stack-allocate one for every STRING_ELT call.
  struct RStringViewer {
    RStringViewer() : strip_out_nuls_(false), nul_was_stripped_(false) {}

    void reset_null_was_stripped() { nul_was_stripped_ = false; }

    void set_strip_out_nuls(bool strip_out_nuls) { strip_out_nuls_ = strip_out_nuls; }

    // convert the i'th string of the Array to an R string (CHARSXP)
    //
    // Nulls become NA_STRING. A string with an embedded nul is either
    // stripped of its nuls (when strip_out_nuls_ is set) or raises an R error.
    SEXP Convert(size_t i) {
      if (array_->IsNull(i)) {
        return NA_STRING;
      }

      view_ = string_array_->GetView(i);
      bool no_nul = std::find(view_.begin(), view_.end(), '\0') == view_.end();

      if (no_nul) {
        return Rf_mkCharLenCE(view_.data(), view_.size(), CE_UTF8);
      } else if (strip_out_nuls_) {
        return ConvertStripNul();
      } else {
        Error();

        // not reached
        return R_NilValue;
      }
    }

    // strip the nuls and then convert to R string
    SEXP ConvertStripNul() {
      const char* old_string = view_.data();

      size_t stripped_len = 0, nul_count = 0;

      for (size_t i = 0; i < view_.size(); i++) {
        if (old_string[i] == '\0') {
          ++nul_count;

          if (nul_count == 1) {
            // first nul spotted: allocate stripped string storage
            stripped_string_.assign(view_.begin(), view_.end());
            stripped_len = i;
          }

          // don't copy old_string[i] (which is \0) into stripped_string
          continue;
        }

        // past the first nul: compact the remaining characters to the left
        if (nul_count > 0) {
          stripped_string_[stripped_len++] = old_string[i];
        }
      }

      nul_was_stripped_ = true;
      return Rf_mkCharLenCE(stripped_string_.data(), stripped_len, CE_UTF8);
    }

    bool nul_was_stripped() const { return nul_was_stripped_; }

    // throw R error about embedded nul
    void Error() {
      stripped_string_ = "embedded nul in string: '";
      for (char c : view_) {
        if (c) {
          stripped_string_ += c;
        } else {
          stripped_string_ += "\\0";
        }
      }

      stripped_string_ +=
          "'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul "
          "= TRUE)";

      // The message embeds the string data: pass it as an argument of "%s"
      // so that any '%' it contains is not interpreted as a conversion.
      Rf_error("%s", stripped_string_.c_str());
    }

    // Select the array that subsequent Convert() calls read from
    void SetArray(const std::shared_ptr<Array>& array) {
      array_ = array;
      string_array_ = internal::checked_cast<const StringArrayType*>(array.get());
    }

    std::shared_ptr<Array> array_;
    const StringArrayType* string_array_;
    std::string stripped_string_;
    bool strip_out_nuls_;
    bool nul_was_stripped_;
    std::string_view view_;
  };

  // Get a single string as a CHARSXP SEXP
  static SEXP Elt(SEXP alt, R_xlen_t i) {
    if (Base::IsMaterialized(alt)) {
      return STRING_ELT(Representation(alt), i);
    }

    // locate the chunk holding position i, and the position inside that chunk
    auto altrep_data =
        reinterpret_cast<ArrowAltrepData*>(R_ExternalPtrAddr(R_altrep_data1(alt)));
    auto resolve = altrep_data->locate(i);
    const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index);
    auto j = resolve.index_in_chunk;

    RStringViewer& r_string_viewer = string_viewer();
    r_string_viewer.SetArray(array);

    // Note: we don't check GetBoolOption("arrow.skip_nul", false) here
    // because it is too expensive to do so. We do set this value whenever
    // an altrep string vector is created or materialized; however, there is
    // a chance that this value could be out of date by the time a value in
    // the vector is accessed.
    r_string_viewer.reset_null_was_stripped();

    // The CHARSXP is not referenced by anything yet: protect it while the
    // warning (which may allocate) is raised.
    SEXP s = PROTECT(r_string_viewer.Convert(j));
    if (r_string_viewer.nul_was_stripped()) {
      Rf_warning("Stripping '\\0' (nul) from character vector");
    }
    UNPROTECT(1);

    return s;
  }

  static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); }

  // Force materialization: convert every string of every chunk into a
  // STRSXP, store it as the representation and drop the ChunkedArray.
  static SEXP Materialize(SEXP alt) {
    if (Base::IsMaterialized(alt)) {
      return Representation(alt);
    }

    const auto& chunked_array = GetChunkedArray(alt);
    SEXP data2 = PROTECT(Rf_allocVector(STRSXP, chunked_array->length()));
    MARK_NOT_MUTABLE(data2);

    // `i` runs over the whole vector, `j` over the current chunk
    R_xlen_t i = 0;
    RStringViewer& r_string_viewer = string_viewer();
    r_string_viewer.reset_null_was_stripped();
    r_string_viewer.set_strip_out_nuls(GetBoolOption("arrow.skip_nul", false));
    for (const auto& array : chunked_array->chunks()) {
      r_string_viewer.SetArray(array);

      auto ni = array->length();
      for (R_xlen_t j = 0; j < ni; j++, i++) {
        SET_STRING_ELT(data2, i, r_string_viewer.Convert(j));
      }
    }

    // warn once for the whole vector
    if (r_string_viewer.nul_was_stripped()) {
      Rf_warning("Stripping '\\0' (nul) from character vector");
    }

    // only set to data2 if all the values have been converted
    SetRepresentation(alt, data2);
    UNPROTECT(1);  // data2

    // remove reference to chunked array
    R_set_altrep_data1(alt, R_NilValue);

    return data2;
  }

  // Pointer to the data without allocating: only possible once materialized
  static const void* Dataptr_or_null(SEXP alt) {
    if (Base::IsMaterialized(alt)) return DATAPTR_RO(Representation(alt));

    // otherwise give up
    return nullptr;
  }

  static void Set_elt(SEXP alt, R_xlen_t i, SEXP v) {
    Rf_error("ALTSTRING objects of type <arrow::array_string_vector> are immutable");
  }

  // The viewer shared by all the methods of this class (function-local static)
  static RStringViewer& string_viewer() {
    static RStringViewer string_viewer;
    return string_viewer;
  }
};

// definition of the per-instantiation altrep class description
template <typename Type>
R_altrep_class_t AltrepVectorString<Type>::class_t;
// initialize altrep, altvec, altreal, and altinteger methods
template <typename AltrepClass>
void InitAltrepMethods(R_altrep_class_t class_t, DllInfo* dll) {
R_set_altrep_Length_method(class_t, AltrepClass::Length);
R_set_altrep_Inspect_method(class_t, AltrepClass::Inspect);
R_set_altrep_Duplicate_method(class_t, AltrepClass::Duplicate);
R_set_altrep_Serialized_state_method(class_t, AltrepClass::Serialized_state);
R_set_altrep_Unserialize_method(class_t, AltrepClass::Unserialize);
R_set_altrep_Coerce_method(class_t, AltrepClass::Coerce);
}
template <typename AltrepClass>
void InitAltvecMethods(R_altrep_class_t class_t, DllInfo* dll) {
R_set_altvec_Dataptr_method(class_t, AltrepClass::Dataptr);
R_set_altvec_Dataptr_or_null_method(class_t, AltrepClass::Dataptr_or_null);
}
template <typename AltrepClass>
void InitAltRealMethods(R_altrep_class_t class_t, DllInfo* dll) {
R_set_altreal_No_NA_method(class_t, AltrepClass::No_NA);
R_set_altreal_Is_sorted_method(class_t, AltrepClass::Is_sorted);
R_set_altreal_Sum_method(class_t, AltrepClass::Sum);
R_set_altreal_Min_method(class_t, AltrepClass::Min);
R_set_altreal_Max_method(class_t, AltrepClass::Max);
R_set_altreal_Elt_method(class_t, AltrepClass::Elt);
R_set_altreal_Get_region_method(class_t, AltrepClass::Get_region);
}
template <typename AltrepClass>
void InitAltIntegerMethods(R_altrep_class_t class_t, DllInfo* dll) {
R_set_altinteger_No_NA_method(class_t, AltrepClass::No_NA);
R_set_altinteger_Is_sorted_method(class_t, AltrepClass::Is_sorted);
R_set_altinteger_Sum_method(class_t, AltrepClass::Sum);
R_set_altinteger_Min_method(class_t, AltrepClass::Min);
R_set_altinteger_Max_method(class_t, AltrepClass::Max);
R_set_altinteger_Elt_method(class_t, AltrepClass::Elt);
R_set_altinteger_Get_region_method(class_t, AltrepClass::Get_region);
}
// Creates the altreal class `name` (in package "arrow"), stores it in
// AltrepClass::class_t and registers the altrep, altvec and altreal methods
// of `AltrepClass` on it.
template <typename AltrepClass>
void InitAltRealClass(DllInfo* dll, const char* name) {
  R_altrep_class_t& cls = AltrepClass::class_t;
  cls = R_make_altreal_class(name, "arrow", dll);

  InitAltrepMethods<AltrepClass>(cls, dll);
  InitAltvecMethods<AltrepClass>(cls, dll);
  InitAltRealMethods<AltrepClass>(cls, dll);
}
// Creates the altinteger class `name` (in package "arrow"), stores it in
// AltrepClass::class_t and registers the altrep, altvec and altinteger
// methods of `AltrepClass` on it.
template <typename AltrepClass>
void InitAltIntegerClass(DllInfo* dll, const char* name) {
  R_altrep_class_t& cls = AltrepClass::class_t;
  cls = R_make_altinteger_class(name, "arrow", dll);

  InitAltrepMethods<AltrepClass>(cls, dll);
  InitAltvecMethods<AltrepClass>(cls, dll);
  InitAltIntegerMethods<AltrepClass>(cls, dll);
}
// Creates the altstring class `name` (in package "arrow"), stores it in
// AltrepClass::class_t and registers the methods of `AltrepClass` on it.
//
// The altrep and altvec methods go through the same helpers as the real and
// integer classes; only the altstring specific methods are set here.
template <typename AltrepClass>
void InitAltStringClass(DllInfo* dll, const char* name) {
  AltrepClass::class_t = R_make_altstring_class(name, "arrow", dll);

  // Length, Inspect, Duplicate, Serialized_state, Unserialize, Coerce
  InitAltrepMethods<AltrepClass>(AltrepClass::class_t, dll);
  // Dataptr, Dataptr_or_null
  InitAltvecMethods<AltrepClass>(AltrepClass::class_t, dll);

  // altstring specific methods
  R_set_altstring_Elt_method(AltrepClass::class_t, AltrepClass::Elt);
  R_set_altstring_Set_elt_method(AltrepClass::class_t, AltrepClass::Set_elt);
  R_set_altstring_No_NA_method(AltrepClass::class_t, AltrepClass::No_NA);
  R_set_altstring_Is_sorted_method(AltrepClass::class_t, AltrepClass::Is_sorted);
}