Skip to content

Commit a16ffa7

Browse files
authored
ARROW-17095: [Go] Allow Concatenating Dictionary Arrays (apache#13624)
Authored-by: Matthew Topol <mtopol@factset.com> Signed-off-by: Matthew Topol <mtopol@factset.com>
1 parent 4e9053b commit a16ffa7

5 files changed

Lines changed: 381 additions & 1 deletion

File tree

go/arrow/array/binarybuilder.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ func (b *BinaryBuilder) Resize(n int) {
180180
}
181181

182182
// ResizeData sets b.values.length to n.
// In this diff hunk the previous assignment of 0 is replaced by n.
func (b *BinaryBuilder) ResizeData(n int) {
183-
b.values.length = 0
183+
b.values.length = n
184184
}
185185

186186
// NewArray creates a Binary array from the memory buffers used by the builder and resets the BinaryBuilder

go/arrow/array/concat.go

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ import (
2626
"github.com/apache/arrow/go/v9/arrow/bitutil"
2727
"github.com/apache/arrow/go/v9/arrow/internal/debug"
2828
"github.com/apache/arrow/go/v9/arrow/memory"
29+
"github.com/apache/arrow/go/v9/internal/bitutils"
30+
"github.com/apache/arrow/go/v9/internal/utils"
2931
)
3032

3133
// Concatenate creates a new arrow.Array which is the concatenation of the
@@ -228,6 +230,85 @@ func concatOffsets(buffers []*memory.Buffer, mem memory.Allocator) (*memory.Buff
228230
return out, valuesRanges, nil
229231
}
230232

233+
func unifyDictionaries(mem memory.Allocator, data []arrow.ArrayData, dt *arrow.DictionaryType) ([]*memory.Buffer, arrow.Array, error) {
234+
unifier, err := NewDictionaryUnifier(mem, dt.ValueType)
235+
if err != nil {
236+
return nil, nil, err
237+
}
238+
defer unifier.Release()
239+
240+
newLookup := make([]*memory.Buffer, len(data))
241+
for i, d := range data {
242+
dictArr := MakeFromData(d.Dictionary())
243+
defer dictArr.Release()
244+
newLookup[i], err = unifier.UnifyAndTranspose(dictArr)
245+
if err != nil {
246+
return nil, nil, err
247+
}
248+
}
249+
250+
unified, err := unifier.GetResultWithIndexType(dt.IndexType)
251+
if err != nil {
252+
for _, b := range newLookup {
253+
b.Release()
254+
}
255+
return nil, nil, err
256+
}
257+
return newLookup, unified, nil
258+
}
259+
260+
func concatDictIndices(mem memory.Allocator, data []arrow.ArrayData, idxType arrow.FixedWidthDataType, transpositions []*memory.Buffer) (out *memory.Buffer, err error) {
261+
defer func() {
262+
if err != nil && out != nil {
263+
out.Release()
264+
out = nil
265+
}
266+
}()
267+
268+
idxWidth := idxType.BitWidth() / 8
269+
outLen := 0
270+
for i, d := range data {
271+
outLen += d.Len()
272+
defer transpositions[i].Release()
273+
}
274+
275+
out = memory.NewResizableBuffer(mem)
276+
out.Resize(outLen * idxWidth)
277+
278+
outData := out.Bytes()
279+
for i, d := range data {
280+
transposeMap := arrow.Int32Traits.CastFromBytes(transpositions[i].Bytes())
281+
src := d.Buffers()[1].Bytes()
282+
if d.Buffers()[0] == nil {
283+
if err = utils.TransposeIntsBuffers(idxType, idxType, src, outData, d.Offset(), 0, d.Len(), transposeMap); err != nil {
284+
return
285+
}
286+
} else {
287+
rdr := bitutils.NewBitRunReader(d.Buffers()[0].Bytes(), int64(d.Offset()), int64(d.Len()))
288+
pos := 0
289+
for {
290+
run := rdr.NextRun()
291+
if run.Len == 0 {
292+
break
293+
}
294+
295+
if run.Set {
296+
err = utils.TransposeIntsBuffers(idxType, idxType, src, outData, d.Offset()+pos, pos, int(run.Len), transposeMap)
297+
if err != nil {
298+
return
299+
}
300+
} else {
301+
memory.Set(outData[pos:pos+(int(run.Len)*idxWidth)], 0x00)
302+
}
303+
304+
pos += int(run.Len)
305+
}
306+
}
307+
outData = outData[d.Len()*idxWidth:]
308+
}
309+
return
310+
}
311+
231312
// concat is the implementation for actually performing the concatenation of the arrow.ArrayData
232313
// objects that we can call internally for nested types.
233314
func concat(data []arrow.ArrayData, mem memory.Allocator) (arrow.ArrayData, error) {
@@ -258,6 +339,42 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arrow.ArrayData, erro
258339
return nil, err
259340
}
260341
out.buffers[1] = bm
342+
case *arrow.DictionaryType:
343+
idxType := dt.IndexType.(arrow.FixedWidthDataType)
344+
// two cases: all dictionaries are the same or we need to unify them
345+
dictsSame := true
346+
dict0 := MakeFromData(data[0].Dictionary())
347+
defer dict0.Release()
348+
for _, d := range data {
349+
dict := MakeFromData(d.Dictionary())
350+
if !Equal(dict0, dict) {
351+
dict.Release()
352+
dictsSame = false
353+
break
354+
}
355+
dict.Release()
356+
}
357+
358+
indexBuffers := gatherBuffersFixedWidthType(data, 1, idxType)
359+
if dictsSame {
360+
out.dictionary = dict0.Data().(*Data)
361+
out.dictionary.Retain()
362+
out.buffers[1] = concatBuffers(indexBuffers, mem)
363+
break
364+
}
365+
366+
indexLookup, unifiedDict, err := unifyDictionaries(mem, data, dt)
367+
if err != nil {
368+
return nil, err
369+
}
370+
defer unifiedDict.Release()
371+
out.dictionary = unifiedDict.Data().(*Data)
372+
out.dictionary.Retain()
373+
374+
out.buffers[1], err = concatDictIndices(mem, data, idxType, indexLookup)
375+
if err != nil {
376+
return nil, err
377+
}
261378
case arrow.FixedWidthDataType:
262379
out.buffers[1] = concatBuffers(gatherBuffersFixedWidthType(data, 1, dt), mem)
263380
case arrow.BinaryDataType:

0 commit comments

Comments
 (0)