Skip to content

Commit 1810db2

Browse files
sbinet and wesm
authored and committed
ARROW-3036: [Go] implement array.NewSlice
This CL implements the ability to slice a given array.Interface, i.e. the Go equivalent of:

```
vs := []float64{1, 2, 3, 4}
sub := vs[1:3]
```

Also, to support sub-slicing, offset support had to be implemented inside the `array.Data` and nullbitmap types. Namely, the `arrow/internal/bitutil.CountSetBits` function had to be modified to support counting bits from an arbitrary offset. This resulted in a minor (?) slowdown:

```
name                 old time/op    new time/op    delta
CountSetBits_3-8     8.16ns ± 1%    8.18ns ± 0%    +0.31%  (p=0.000 n=19+18)
CountSetBits_32-8    7.62ns ± 1%    7.79ns ± 1%    +2.24%  (p=0.000 n=18+18)
CountSetBits_128-8   6.80ns ± 1%    7.03ns ± 1%    +3.40%  (p=0.000 n=18+19)
CountSetBits_1000-8  18.7ns ± 0%    17.8ns ± 1%    -4.92%  (p=0.000 n=16+20)
CountSetBits_1024-8  16.1ns ± 0%    16.1ns ± 1%    +0.44%  (p=0.000 n=20+20)

name                 old alloc/op   new alloc/op   delta
CountSetBits_3-8     0.00B          0.00B          ~  (all equal)
CountSetBits_32-8    0.00B          0.00B          ~  (all equal)
CountSetBits_128-8   0.00B          0.00B          ~  (all equal)
CountSetBits_1000-8  0.00B          0.00B          ~  (all equal)
CountSetBits_1024-8  0.00B          0.00B          ~  (all equal)

name                 old allocs/op  new allocs/op  delta
CountSetBits_3-8     0.00           0.00           ~  (all equal)
CountSetBits_32-8    0.00           0.00           ~  (all equal)
CountSetBits_128-8   0.00           0.00           ~  (all equal)
CountSetBits_1000-8  0.00           0.00           ~  (all equal)
CountSetBits_1024-8  0.00           0.00           ~  (all equal)
```

I believe some of that slowdown could be recouped (see `FIXME` in `go/arrow/internal/bitutil/bitutil.go`).

@stuartcarnie PTAL. Needs apache#2412.

Author: Sebastien Binet <binet@cern.ch>

Closes apache#2419 from sbinet/issue-3036 and squashes the following commits:

547c609 <Sebastien Binet> ARROW-3036: implement array.NewSlice
767c281 <Sebastien Binet> add support for nullbitmap with offset
cb6cdbb <Sebastien Binet> consolidate List array
1 parent 72e4470 commit 1810db2

17 files changed

Lines changed: 686 additions & 216 deletions

go/arrow/array/array.go

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ func (a *array) DataType() arrow.DataType { return a.data.dtype }
9393
// NullN returns the number of null values in the array.
9494
func (a *array) NullN() int {
9595
if a.data.nulls < 0 {
96-
a.data.nulls = a.data.length - bitutil.CountSetBits(a.nullBitmapBytes, a.data.length)
96+
a.data.nulls = a.data.length - bitutil.CountSetBits(a.nullBitmapBytes, a.data.offset, a.data.length)
9797
}
9898
return a.data.nulls
9999
}
@@ -109,13 +109,13 @@ func (a *array) Len() int { return a.data.length }
109109
// IsNull returns true if value at index is null.
110110
// NOTE: IsNull will panic if NullBitmapBytes is not empty and i is out of range (i < 0 or i ≥ Len).
111111
func (a *array) IsNull(i int) bool {
112-
return len(a.nullBitmapBytes) != 0 && bitutil.BitIsNotSet(a.nullBitmapBytes, i)
112+
return len(a.nullBitmapBytes) != 0 && bitutil.BitIsNotSet(a.nullBitmapBytes, a.data.offset+i)
113113
}
114114

115115
// IsValid returns true if value at index is not null.
116116
// NOTE: IsValid will panic if NullBitmapBytes is not empty and i is out of range (i < 0 or i ≥ Len).
117117
func (a *array) IsValid(i int) bool {
118-
return len(a.nullBitmapBytes) == 0 || bitutil.BitIsSet(a.nullBitmapBytes, i)
118+
return len(a.nullBitmapBytes) == 0 || bitutil.BitIsSet(a.nullBitmapBytes, a.data.offset+i)
119119
}
120120

121121
func (a *array) setData(data *Data) {
@@ -184,6 +184,19 @@ func MakeFromData(data *Data) Interface {
184184
return makeArrayFn[byte(data.dtype.ID()&0x1f)](data)
185185
}
186186

187+
// NewSlice constructs a zero-copy slice of the array with the indicated
188+
// indices i and j, corresponding to array[i:j].
189+
// The returned array must be Release()'d after use.
190+
//
191+
// NewSlice panics if the slice is outside the valid range of the input array.
192+
// NewSlice panics if j < i.
193+
func NewSlice(arr Interface, i, j int64) Interface {
194+
data := NewSliceData(arr.Data(), i, j)
195+
slice := MakeFromData(data)
196+
data.Release()
197+
return slice
198+
}
199+
187200
func init() {
188201
makeArrayFn[arrow.LIST] = func(data *Data) Interface { return NewListData(data) }
189202
makeArrayFn[arrow.STRUCT] = func(data *Data) Interface { return NewStructData(data) }

go/arrow/array/array_test.go

Lines changed: 150 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,14 @@ func TestMakeFromData(t *testing.T) {
6262
{name: "timestamp", d: &testDataType{arrow.TIMESTAMP}},
6363

6464
{name: "list", d: &testDataType{arrow.LIST}, child: []*array.Data{
65-
array.NewData(&testDataType{arrow.INT64}, 0, make([]*memory.Buffer, 4), nil, 0),
66-
array.NewData(&testDataType{arrow.INT64}, 0, make([]*memory.Buffer, 4), nil, 0),
65+
array.NewData(&testDataType{arrow.INT64}, 0, make([]*memory.Buffer, 4), nil, 0, 0),
66+
array.NewData(&testDataType{arrow.INT64}, 0, make([]*memory.Buffer, 4), nil, 0, 0),
6767
}},
6868

6969
{name: "struct", d: &testDataType{arrow.STRUCT}},
7070
{name: "struct", d: &testDataType{arrow.STRUCT}, child: []*array.Data{
71-
array.NewData(&testDataType{arrow.INT64}, 0, make([]*memory.Buffer, 4), nil, 0),
72-
array.NewData(&testDataType{arrow.INT64}, 0, make([]*memory.Buffer, 4), nil, 0),
71+
array.NewData(&testDataType{arrow.INT64}, 0, make([]*memory.Buffer, 4), nil, 0, 0),
72+
array.NewData(&testDataType{arrow.INT64}, 0, make([]*memory.Buffer, 4), nil, 0, 0),
7373
}},
7474

7575
// invalid types
@@ -83,7 +83,7 @@ func TestMakeFromData(t *testing.T) {
8383
if test.size != 0 {
8484
n = test.size
8585
}
86-
data := array.NewData(test.d, 0, b[:n], test.child, 0)
86+
data := array.NewData(test.d, 0, b[:n], test.child, 0, 0)
8787

8888
if test.expPanic {
8989
assert.PanicsWithValue(t, test.expError, func() {
@@ -116,7 +116,7 @@ func TestArray_NullN(t *testing.T) {
116116
for _, test := range tests {
117117
t.Run(test.name, func(t *testing.T) {
118118
buf := memory.NewBufferBytes(test.bm)
119-
data := array.NewData(arrow.FixedWidthTypes.Boolean, test.l, []*memory.Buffer{buf, nil}, nil, test.n)
119+
data := array.NewData(arrow.FixedWidthTypes.Boolean, test.l, []*memory.Buffer{buf, nil}, nil, test.n, 0)
120120
buf.Release()
121121
ar := array.MakeFromData(data)
122122
data.Release()
@@ -126,3 +126,147 @@ func TestArray_NullN(t *testing.T) {
126126
})
127127
}
128128
}
129+
130+
func TestArraySlice(t *testing.T) {
131+
pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
132+
defer pool.AssertSize(t, 0)
133+
134+
var (
135+
valids = []bool{true, true, true, false, true, true}
136+
vs = []float64{1, 2, 3, 0, 4, 5}
137+
)
138+
139+
b := array.NewFloat64Builder(pool)
140+
defer b.Release()
141+
142+
for _, tc := range []struct {
143+
i, j int
144+
panics bool
145+
len int
146+
}{
147+
{i: 0, j: len(valids), panics: false, len: len(valids)},
148+
{i: len(valids), j: len(valids), panics: false, len: 0},
149+
{i: 0, j: 1, panics: false, len: 1},
150+
{i: 1, j: 1, panics: false, len: 0},
151+
{i: 0, j: len(valids) + 1, panics: true},
152+
{i: 2, j: 1, panics: true},
153+
{i: len(valids) + 1, j: len(valids) + 1, panics: true},
154+
} {
155+
t.Run("", func(t *testing.T) {
156+
b.AppendValues(vs, valids)
157+
158+
arr := b.NewFloat64Array()
159+
defer arr.Release()
160+
161+
if got, want := arr.Len(), len(valids); got != want {
162+
t.Fatalf("got=%d, want=%d", got, want)
163+
}
164+
165+
if tc.panics {
166+
defer func() {
167+
e := recover()
168+
if e == nil {
169+
t.Fatalf("this should have panicked, but did not")
170+
}
171+
}()
172+
}
173+
174+
slice := array.NewSlice(arr, int64(tc.i), int64(tc.j)).(*array.Float64)
175+
defer slice.Release()
176+
177+
if got, want := slice.Len(), tc.len; got != want {
178+
t.Fatalf("invalid slice length: got=%d, want=%d", got, want)
179+
}
180+
})
181+
}
182+
}
183+
184+
func TestArraySliceTypes(t *testing.T) {
185+
pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
186+
defer pool.AssertSize(t, 0)
187+
188+
valids := []bool{true, true, true, false, true, true}
189+
190+
for _, tc := range []struct {
191+
values interface{}
192+
builder array.Builder
193+
append func(b array.Builder, vs interface{})
194+
}{
195+
{
196+
values: []bool{true, false, true, false, true, false},
197+
builder: array.NewBooleanBuilder(pool),
198+
append: func(b array.Builder, vs interface{}) { b.(*array.BooleanBuilder).AppendValues(vs.([]bool), valids) },
199+
},
200+
{
201+
values: []uint8{1, 2, 3, 0, 4, 5},
202+
builder: array.NewUint8Builder(pool),
203+
append: func(b array.Builder, vs interface{}) { b.(*array.Uint8Builder).AppendValues(vs.([]uint8), valids) },
204+
},
205+
{
206+
values: []uint16{1, 2, 3, 0, 4, 5},
207+
builder: array.NewUint16Builder(pool),
208+
append: func(b array.Builder, vs interface{}) { b.(*array.Uint16Builder).AppendValues(vs.([]uint16), valids) },
209+
},
210+
{
211+
values: []uint32{1, 2, 3, 0, 4, 5},
212+
builder: array.NewUint32Builder(pool),
213+
append: func(b array.Builder, vs interface{}) { b.(*array.Uint32Builder).AppendValues(vs.([]uint32), valids) },
214+
},
215+
{
216+
values: []uint64{1, 2, 3, 0, 4, 5},
217+
builder: array.NewUint64Builder(pool),
218+
append: func(b array.Builder, vs interface{}) { b.(*array.Uint64Builder).AppendValues(vs.([]uint64), valids) },
219+
},
220+
{
221+
values: []int8{1, 2, 3, 0, 4, 5},
222+
builder: array.NewInt8Builder(pool),
223+
append: func(b array.Builder, vs interface{}) { b.(*array.Int8Builder).AppendValues(vs.([]int8), valids) },
224+
},
225+
{
226+
values: []int16{1, 2, 3, 0, 4, 5},
227+
builder: array.NewInt16Builder(pool),
228+
append: func(b array.Builder, vs interface{}) { b.(*array.Int16Builder).AppendValues(vs.([]int16), valids) },
229+
},
230+
{
231+
values: []int32{1, 2, 3, 0, 4, 5},
232+
builder: array.NewInt32Builder(pool),
233+
append: func(b array.Builder, vs interface{}) { b.(*array.Int32Builder).AppendValues(vs.([]int32), valids) },
234+
},
235+
{
236+
values: []int64{1, 2, 3, 0, 4, 5},
237+
builder: array.NewInt64Builder(pool),
238+
append: func(b array.Builder, vs interface{}) { b.(*array.Int64Builder).AppendValues(vs.([]int64), valids) },
239+
},
240+
{
241+
values: []float32{1, 2, 3, 0, 4, 5},
242+
builder: array.NewFloat32Builder(pool),
243+
append: func(b array.Builder, vs interface{}) { b.(*array.Float32Builder).AppendValues(vs.([]float32), valids) },
244+
},
245+
{
246+
values: []float64{1, 2, 3, 0, 4, 5},
247+
builder: array.NewFloat64Builder(pool),
248+
append: func(b array.Builder, vs interface{}) { b.(*array.Float64Builder).AppendValues(vs.([]float64), valids) },
249+
},
250+
} {
251+
t.Run("", func(t *testing.T) {
252+
defer tc.builder.Release()
253+
254+
b := tc.builder
255+
tc.append(b, tc.values)
256+
257+
arr := b.NewArray()
258+
defer arr.Release()
259+
260+
if got, want := arr.Len(), len(valids); got != want {
261+
t.Fatalf("invalid length: got=%d, want=%d", got, want)
262+
}
263+
264+
slice := array.NewSlice(arr, 2, 5)
265+
defer slice.Release()
266+
267+
if got, want := slice.Len(), 3; got != want {
268+
t.Fatalf("invalid slice length: got=%d, want=%d", got, want)
269+
}
270+
})
271+
}
272+
}

go/arrow/array/binarybuilder.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ func (b *BinaryBuilder) NewBinaryArray() (a *Binary) {
163163
func (b *BinaryBuilder) newData() (data *Data) {
164164
b.appendNextOffset()
165165
offsets, values := b.offsets.Finish(), b.values.Finish()
166-
data = NewData(b.dtype, b.length, []*memory.Buffer{b.nullBitmap, offsets, values}, nil, b.nulls)
166+
data = NewData(b.dtype, b.length, []*memory.Buffer{b.nullBitmap, offsets, values}, nil, b.nulls, 0)
167167
if offsets != nil {
168168
offsets.Release()
169169
}

go/arrow/array/boolean.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ type Boolean struct {
3232
// The nullBitmap buffer can be nil if there are no null values.
3333
// If nulls is not known, use UnknownNullCount to calculate the value of NullN at runtime from the nullBitmap buffer.
3434
func NewBoolean(length int, data *memory.Buffer, nullBitmap *memory.Buffer, nulls int) *Boolean {
35-
return NewBooleanData(NewData(arrow.FixedWidthTypes.Boolean, length, []*memory.Buffer{nullBitmap, data}, nil, nulls))
35+
return NewBooleanData(NewData(arrow.FixedWidthTypes.Boolean, length, []*memory.Buffer{nullBitmap, data}, nil, nulls, 0))
3636
}
3737

3838
func NewBooleanData(data *Data) *Boolean {

go/arrow/array/booleanbuilder.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ func (b *BooleanBuilder) newData() *Data {
144144
// trim buffers
145145
b.data.Resize(bytesRequired)
146146
}
147-
res := NewData(arrow.FixedWidthTypes.Boolean, b.length, []*memory.Buffer{b.nullBitmap, b.data}, nil, b.nulls)
147+
res := NewData(arrow.FixedWidthTypes.Boolean, b.length, []*memory.Buffer{b.nullBitmap, b.data}, nil, b.nulls, 0)
148148
b.reset()
149149

150150
if b.data != nil {

go/arrow/array/data.go

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,13 @@ type Data struct {
2929
refCount int64
3030
dtype arrow.DataType
3131
nulls int
32+
offset int
3233
length int
3334
buffers []*memory.Buffer // TODO(sgc): should this be an interface?
3435
childData []*Data // TODO(sgc): managed by ListArray, StructArray and UnionArray types
3536
}
3637

37-
func NewData(dtype arrow.DataType, length int, buffers []*memory.Buffer, childData []*Data, nulls int) *Data {
38+
func NewData(dtype arrow.DataType, length int, buffers []*memory.Buffer, childData []*Data, nulls, offset int) *Data {
3839
for _, b := range buffers {
3940
if b != nil {
4041
b.Retain()
@@ -52,6 +53,7 @@ func NewData(dtype arrow.DataType, length int, buffers []*memory.Buffer, childDa
5253
dtype: dtype,
5354
nulls: nulls,
5455
length: length,
56+
offset: offset,
5557
buffers: buffers,
5658
childData: childData,
5759
}
@@ -86,3 +88,44 @@ func (d *Data) Release() {
8688
func (d *Data) DataType() arrow.DataType { return d.dtype }
8789
func (d *Data) NullN() int { return d.nulls }
8890
func (d *Data) Len() int { return d.length }
91+
92+
// NewSliceData returns a new slice that shares backing data with the input.
93+
// The returned Data slice starts at i and extends j-i elements, such as:
94+
// slice := data[i:j]
95+
// The returned value must be Release'd after use.
96+
//
97+
// NewSliceData panics if the slice is outside the valid range of the input Data.
98+
// NewSliceData panics if j < i.
99+
func NewSliceData(data *Data, i, j int64) *Data {
100+
if j > int64(data.length) || i > j || data.offset+int(i) > data.length {
101+
panic("arrow/array: index out of range")
102+
}
103+
104+
for _, b := range data.buffers {
105+
if b != nil {
106+
b.Retain()
107+
}
108+
}
109+
110+
for _, child := range data.childData {
111+
if child != nil {
112+
child.Retain()
113+
}
114+
}
115+
116+
o := &Data{
117+
refCount: 1,
118+
dtype: data.dtype,
119+
nulls: UnknownNullCount,
120+
length: int(j - i),
121+
offset: data.offset + int(i),
122+
buffers: data.buffers,
123+
childData: data.childData,
124+
}
125+
126+
if data.nulls == 0 {
127+
o.nulls = 0
128+
}
129+
130+
return o
131+
}

0 commit comments

Comments
 (0)