Skip to content

Commit b31c9ac

Browse files
authored
ARROW-17749: [Go] Implement Filter and Take for Structs (apache#14145)
Authored-by: Matt Topol <zotthewizard@gmail.com> Signed-off-by: Matt Topol <zotthewizard@gmail.com>
1 parent de1ada3 commit b31c9ac

6 files changed

Lines changed: 547 additions & 16 deletions

File tree

go/arrow/array/concat.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,12 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arrow.ArrayData, erro
382382
out.buffers[0] = bm
383383
}
384384

385-
switch dt := out.dtype.(type) {
385+
dt := out.dtype
386+
if dt.ID() == arrow.EXTENSION {
387+
dt = dt.(arrow.ExtensionType).StorageType()
388+
}
389+
390+
switch dt := dt.(type) {
386391
case *arrow.NullType:
387392
case *arrow.BooleanType:
388393
bm, err := concatBitmaps(gatherBitmaps(data, 1), mem)

go/arrow/compute/internal/exec/span.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ func (a *ArraySpan) MakeData() arrow.ArrayData {
162162
return result
163163
} else if dt.ID() == arrow.DENSE_UNION || dt.ID() == arrow.SPARSE_UNION {
164164
bufs[0] = nil
165+
nulls = 0
165166
}
166167

167168
if len(a.Children) > 0 {

go/arrow/compute/internal/kernels/helpers.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,10 @@ func (bldr *execBufBuilder) unsafeAdvance(n int) {
579579
}
580580

581581
func (bldr *execBufBuilder) finish() (buf *memory.Buffer) {
582+
if bldr.buffer == nil {
583+
buf = memory.NewBufferBytes(nil)
584+
return
585+
}
582586
bldr.buffer.Resize(bldr.sz)
583587
buf = bldr.buffer
584588
bldr.buffer, bldr.sz = nil, 0

go/arrow/compute/internal/kernels/vector_selection.go

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,14 @@ package kernels
1818

1919
import (
2020
"fmt"
21+
"math"
2122

2223
"github.com/apache/arrow/go/v10/arrow"
2324
"github.com/apache/arrow/go/v10/arrow/array"
2425
"github.com/apache/arrow/go/v10/arrow/bitutil"
2526
"github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
27+
"github.com/apache/arrow/go/v10/arrow/internal/debug"
28+
"github.com/apache/arrow/go/v10/arrow/memory"
2629
"github.com/apache/arrow/go/v10/internal/bitutils"
2730
)
2831

@@ -87,6 +90,149 @@ func preallocateData(ctx *exec.KernelCtx, length int64, bitWidth int, allocateVa
8790
}
8891
}
8992

93+
type builder[T any] interface {
94+
array.Builder
95+
Append(T)
96+
UnsafeAppend(T)
97+
UnsafeAppendBoolToBitmap(bool)
98+
}
99+
100+
func getTakeIndices[T exec.IntTypes | exec.UintTypes](mem memory.Allocator, filter *exec.ArraySpan, nullSelect NullSelectionBehavior) arrow.ArrayData {
101+
var (
102+
filterData = filter.Buffers[1].Buf
103+
haveFilterNulls = filter.MayHaveNulls()
104+
filterIsValid = filter.Buffers[0].Buf
105+
idxType = exec.GetDataType[T]()
106+
)
107+
108+
if haveFilterNulls && nullSelect == EmitNulls {
109+
// Most complex case: the filter may have nulls and we don't drop them.
110+
// The logic is ternary:
111+
// - filter is null: emit null
112+
// - filter is valid and true: emit index
113+
// - filter is valid and false: don't emit anything
114+
115+
bldr := array.NewBuilder(mem, idxType).(builder[T])
116+
defer bldr.Release()
117+
118+
// position relative to start of filter
119+
var pos T
120+
// current position taking the filter offset into account
121+
posWithOffset := filter.Offset
122+
123+
// to count blocks where filterData[i] || !filterIsValid[i]
124+
filterCounter := bitutils.NewBinaryBitBlockCounter(filterData, filterIsValid, filter.Offset, filter.Offset, filter.Len)
125+
isValidCounter := bitutils.NewBitBlockCounter(filterIsValid, filter.Offset, filter.Len)
126+
for int64(pos) < filter.Len {
127+
// true OR NOT valid
128+
selectedOrNullBlock := filterCounter.NextOrNotWord()
129+
if selectedOrNullBlock.NoneSet() {
130+
pos += T(selectedOrNullBlock.Len)
131+
posWithOffset += int64(selectedOrNullBlock.Len)
132+
continue
133+
}
134+
bldr.Reserve(int(selectedOrNullBlock.Popcnt))
135+
136+
// if the values are all valid and the selectedOrNullBlock
137+
// is full, then we can infer that all the values are true
138+
// and skip the bit checking
139+
isValidBlock := isValidCounter.NextWord()
140+
if selectedOrNullBlock.AllSet() && isValidBlock.AllSet() {
141+
// all the values are selected and non-null
142+
for i := 0; i < int(selectedOrNullBlock.Len); i++ {
143+
bldr.UnsafeAppend(pos)
144+
pos++
145+
}
146+
posWithOffset += int64(selectedOrNullBlock.Len)
147+
} else {
148+
// some of the values are false or null
149+
for i := 0; i < int(selectedOrNullBlock.Len); i++ {
150+
if bitutil.BitIsSet(filterIsValid, int(posWithOffset)) {
151+
if bitutil.BitIsSet(filterData, int(posWithOffset)) {
152+
bldr.UnsafeAppend(pos)
153+
}
154+
} else {
155+
// null slot, append null
156+
bldr.UnsafeAppendBoolToBitmap(false)
157+
}
158+
pos++
159+
posWithOffset++
160+
}
161+
}
162+
}
163+
164+
result := bldr.NewArray()
165+
defer result.Release()
166+
result.Data().Retain()
167+
return result.Data()
168+
}
169+
170+
bldr := newBufferBuilder[T](mem)
171+
if haveFilterNulls {
172+
// the filter may have nulls, so we scan the validity bitmap
173+
// and the filter data bitmap together
174+
debug.Assert(nullSelect == DropNulls, "incorrect nullselect logic")
175+
176+
// position relative to start of the filter
177+
var pos T
178+
// current position taking the filter offset into account
179+
posWithOffset := filter.Offset
180+
181+
filterCounter := bitutils.NewBinaryBitBlockCounter(filterData, filterIsValid, filter.Offset, filter.Offset, filter.Len)
182+
for int64(pos) < filter.Len {
183+
andBlock := filterCounter.NextAndWord()
184+
bldr.reserve(int(andBlock.Popcnt))
185+
if andBlock.AllSet() {
186+
// all the values are selected and non-null
187+
for i := 0; i < int(andBlock.Len); i++ {
188+
bldr.unsafeAppend(pos)
189+
pos++
190+
}
191+
posWithOffset += int64(andBlock.Len)
192+
} else if !andBlock.NoneSet() {
193+
// some values are false or null
194+
for i := 0; i < int(andBlock.Len); i++ {
195+
if bitutil.BitIsSet(filterIsValid, int(posWithOffset)) && bitutil.BitIsSet(filterData, int(posWithOffset)) {
196+
bldr.unsafeAppend(pos)
197+
}
198+
pos++
199+
posWithOffset++
200+
}
201+
} else {
202+
pos += T(andBlock.Len)
203+
posWithOffset += int64(andBlock.Len)
204+
}
205+
}
206+
} else {
207+
// filter has no nulls, so we only need to look for true values
208+
bitutils.VisitSetBitRuns(filterData, filter.Offset, filter.Len,
209+
func(pos, length int64) error {
210+
// append consecutive run of indices
211+
bldr.reserve(int(length))
212+
for i := int64(0); i < length; i++ {
213+
bldr.unsafeAppend(T(pos + i))
214+
}
215+
return nil
216+
})
217+
}
218+
219+
length := bldr.len()
220+
outBuf := bldr.finish()
221+
defer outBuf.Release()
222+
return array.NewData(idxType, length, []*memory.Buffer{nil, outBuf}, nil, 0, 0)
223+
}
224+
225+
func GetTakeIndices(mem memory.Allocator, filter *exec.ArraySpan, nullSelect NullSelectionBehavior) (arrow.ArrayData, error) {
226+
debug.Assert(filter.Type.ID() == arrow.BOOL, "filter should be a boolean array")
227+
if filter.Len < math.MaxUint16 {
228+
return getTakeIndices[uint16](mem, filter, nullSelect), nil
229+
} else if filter.Len < math.MaxUint32 {
230+
return getTakeIndices[uint32](mem, filter, nullSelect), nil
231+
}
232+
return nil, fmt.Errorf("%w: filter length exceeds UINT32_MAX, consider a different strategy for selecting elements",
233+
arrow.ErrNotImplemented)
234+
}
235+
90236
type writeFiltered interface {
91237
OutPos() int
92238
WriteValue(int64)
@@ -1121,6 +1267,9 @@ func VarBinaryImpl[OffsetT int32 | int64](ctx *exec.KernelCtx, batch *exec.ExecS
11211267
valOffset := rawOffsets[idx]
11221268
valSize := rawOffsets[idx+1] - valOffset
11231269

1270+
if valSize == 0 {
1271+
return nil
1272+
}
11241273
offset += valSize
11251274
if valSize > OffsetT(spaceAvail) {
11261275
dataBuilder.reserve(int(valSize))
@@ -1362,6 +1511,19 @@ func FilterBinary(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResul
13621511
return fmt.Errorf("%w: invalid type for binary filter", arrow.ErrInvalid)
13631512
}
13641513

1514+
// visitNoop is a visitor callback that does nothing and always succeeds.
func visitNoop() error {
	return nil
}

// visitIdxNoop is an index-accepting visitor callback that ignores the index
// and always succeeds.
func visitIdxNoop(_ int64) error {
	return nil
}
1516+
1517+
func StructImpl(ctx *exec.KernelCtx, batch *exec.ExecSpan, outputLength int64, out *exec.ExecResult, fn selectionOutputFn) error {
1518+
var (
1519+
values = &batch.Values[0].Array
1520+
selection = &batch.Values[1].Array
1521+
)
1522+
1523+
// nothing we need to do other than generate the validity bitmap
1524+
return fn(ctx, outputLength, values, selection, out, visitIdxNoop, visitNoop)
1525+
}
1526+
13651527
type SelectionKernelData struct {
13661528
In exec.InputType
13671529
Exec exec.ArrayKernelExec

0 commit comments

Comments
 (0)