Skip to content

Commit e4ae2f6

Browse files
Anson Qian authored and sbinet committed
ARROW-3951: [Go] implement a CSV writer
@sbinet Author: Anson Qian <abq@uber.com> Closes apache#3755 from anson627/arrow-3951 and squashes the following commits: df1735a <Anson Qian> Fix reader test 9bc8dc0 <Anson Qian> Fix unit test 6e63617 <Anson Qian> Fix typo 7624a97 <Anson Qian> Add example and bump up test coverage f460e19 <Anson Qian> Add newline at end of file 947235c <Anson Qian> Consoliate option for reader and writer 2a57a67 <Anson Qian> Add memory size check e00638e <Anson Qian> Address code reviews 92cbcea <Anson Qian> ARROW-3951 implement a CSV writer
1 parent 3db5797 commit e4ae2f6

5 files changed

Lines changed: 448 additions & 96 deletions

File tree

go/arrow/csv/common.go

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing, software
12+
// distributed under the License is distributed on an "AS IS" BASIS,
13+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
// See the License for the specific language governing permissions and
15+
// limitations under the License.
16+
17+
// Package csv reads CSV files and presents the extracted data as records. It
18+
// also writes records to CSV files.
19+
package csv
20+
21+
import (
22+
"errors"
23+
"fmt"
24+
25+
"github.com/apache/arrow/go/arrow"
26+
"github.com/apache/arrow/go/arrow/memory"
27+
)
28+
29+
var (
30+
// ErrMismatchFields is a package-level sentinel error value.
// NOTE(review): the identifier refers to fields while the message says
// "records"; presumably it reports a row whose field count differs from the
// schema's. Confirm against the reader before relying on the wording.
ErrMismatchFields = errors.New("arrow/csv: number of records mismatch")
31+
)
32+
33+
// Option configures a CSV reader/writer.
//
// An Option receives the value being configured as a config and is expected
// to type-switch on it to find out whether it was handed a *Reader or a
// *Writer.
34+
type Option func(config)
35+
// config is the value an Option is applied to. It is the empty interface, so
// which concrete types an option accepts is checked at run time by the option
// itself rather than by the compiler.
type config interface{}
36+
37+
// WithComma specifies the field separator character used while reading or
// writing CSV files.
//
// The returned Option sets the Comma field of cfg.r when applied to a *Reader
// and of cfg.w when applied to a *Writer. It panics if applied to any other
// type.
38+
func WithComma(c rune) Option {
39+
return func(cfg config) {
40+
switch cfg := cfg.(type) {
41+
case *Reader:
42+
cfg.r.Comma = c
43+
case *Writer:
44+
cfg.w.Comma = c
45+
default:
46+
panic(fmt.Errorf("arrow/csv: unknown config type %T", cfg))
47+
}
48+
}
49+
}
50+
51+
// WithComment specifies the comment character used while parsing CSV files.
//
// The returned Option sets the Comment field of cfg.r when applied to a
// *Reader. There is no *Writer case, so applying it to a *Writer (or any other
// type) panics.
52+
func WithComment(c rune) Option {
53+
return func(cfg config) {
54+
switch cfg := cfg.(type) {
55+
case *Reader:
56+
cfg.r.Comment = c
57+
default:
58+
panic(fmt.Errorf("arrow/csv: unknown config type %T", cfg))
59+
}
60+
}
61+
}
62+
63+
// WithAllocator specifies the Arrow memory allocator used while building records.
//
// The returned Option stores mem in the mem field of a *Reader. There is no
// *Writer case, so applying it to a *Writer (or any other type) panics.
64+
func WithAllocator(mem memory.Allocator) Option {
65+
return func(cfg config) {
66+
switch cfg := cfg.(type) {
67+
case *Reader:
68+
cfg.mem = mem
69+
default:
70+
panic(fmt.Errorf("arrow/csv: unknown config type %T", cfg))
71+
}
72+
}
73+
}
74+
75+
// WithChunk specifies the chunk size used while parsing CSV files.
76+
//
77+
// If n is zero or 1, no chunking will take place and the reader will create
78+
// one record per row.
79+
// If n is greater than 1, chunks of n rows will be read.
80+
// If n is negative, the reader will load the whole CSV file into memory and
81+
// create one big record with all the rows.
//
// The returned Option stores n in the chunk field of a *Reader. There is no
// *Writer case, so applying it to a *Writer (or any other type) panics.
82+
func WithChunk(n int) Option {
83+
return func(cfg config) {
84+
switch cfg := cfg.(type) {
85+
case *Reader:
86+
cfg.chunk = n
87+
default:
88+
panic(fmt.Errorf("arrow/csv: unknown config type %T", cfg))
89+
}
90+
}
91+
}
92+
93+
// WithCRLF specifies the line terminator used while writing CSV files.
94+
// If useCRLF is true, \r\n is used as the line terminator, otherwise \n is used.
95+
// The default value is false.
//
// The returned Option sets the UseCRLF field of cfg.w when applied to a
// *Writer. There is no *Reader case, so applying it to a *Reader (or any other
// type) panics.
96+
func WithCRLF(useCRLF bool) Option {
97+
return func(cfg config) {
98+
switch cfg := cfg.(type) {
99+
case *Writer:
100+
cfg.w.UseCRLF = useCRLF
101+
default:
102+
panic(fmt.Errorf("arrow/csv: unknown config type %T", cfg))
103+
}
104+
}
105+
}
106+
107+
// validate checks that every field of schema has a data type this package
// handles: boolean, signed and unsigned integers of 8 to 64 bits, 32- and
// 64-bit floats, and strings.
//
// It returns nothing; on the first field with any other type it panics with an
// error naming the field's index, name and Go type.
func validate(schema *arrow.Schema) {
108+
for i, f := range schema.Fields() {
109+
switch ft := f.Type.(type) {
110+
case *arrow.BooleanType:
111+
case *arrow.Int8Type, *arrow.Int16Type, *arrow.Int32Type, *arrow.Int64Type:
112+
case *arrow.Uint8Type, *arrow.Uint16Type, *arrow.Uint32Type, *arrow.Uint64Type:
113+
case *arrow.Float32Type, *arrow.Float64Type:
114+
case *arrow.StringType:
115+
default:
116+
panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, ft))
117+
}
118+
}
119+
}
Lines changed: 0 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,10 @@
1414
// See the License for the specific language governing permissions and
1515
// limitations under the License.
1616

17-
// Package csv reads CSV files and presents the extracted data as records.
1817
package csv
1918

2019
import (
2120
"encoding/csv"
22-
"errors"
23-
"fmt"
2421
"io"
2522
"strconv"
2623
"sync/atomic"
@@ -31,47 +28,6 @@ import (
3128
"github.com/apache/arrow/go/arrow/memory"
3229
)
3330

34-
var (
35-
ErrMismatchFields = errors.New("arrow/csv: number of records mismatch")
36-
)
37-
38-
// Option configures a CSV reader.
39-
type Option func(*Reader)
40-
41-
// WithComment specifies the comment character used while parsing CSV files.
42-
func WithComment(c rune) Option {
43-
return func(r *Reader) {
44-
r.r.Comment = c
45-
}
46-
}
47-
48-
// WithComma specifies the fields separation character used while parsing CSV files.
49-
func WithComma(c rune) Option {
50-
return func(r *Reader) {
51-
r.r.Comma = c
52-
}
53-
}
54-
55-
// WithAllocator specifies the Arrow memory allocator used while building records.
56-
func WithAllocator(mem memory.Allocator) Option {
57-
return func(r *Reader) {
58-
r.mem = mem
59-
}
60-
}
61-
62-
// WithChunk specifies the chunk size used while parsing CSV files.
63-
//
64-
// If n is zero or 1, no chunking will take place and the reader will create
65-
// one record per row.
66-
// If n is greater than 1, chunks of n rows will be read.
67-
// If n is negative, the reader will load the whole CSV file into memory and
68-
// create one big record with all the rows.
69-
func WithChunk(n int) Option {
70-
return func(r *Reader) {
71-
r.chunk = n
72-
}
73-
}
74-
7531
// Reader wraps encoding/csv.Reader and creates array.Records from a schema.
7632
type Reader struct {
7733
r *csv.Reader
@@ -392,20 +348,6 @@ func (r *Reader) Release() {
392348
}
393349
}
394350

395-
func validate(schema *arrow.Schema) {
396-
for i, f := range schema.Fields() {
397-
switch ft := f.Type.(type) {
398-
case *arrow.BooleanType:
399-
case *arrow.Int8Type, *arrow.Int16Type, *arrow.Int32Type, *arrow.Int64Type:
400-
case *arrow.Uint8Type, *arrow.Uint16Type, *arrow.Uint32Type, *arrow.Uint64Type:
401-
case *arrow.Float32Type, *arrow.Float64Type:
402-
case *arrow.StringType:
403-
default:
404-
panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, ft))
405-
}
406-
}
407-
}
408-
409351
var (
410352
_ array.RecordReader = (*Reader)(nil)
411353
)
Lines changed: 38 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -56,42 +56,42 @@ func Example() {
5656
for r.Next() {
5757
rec := r.Record()
5858
for i, col := range rec.Columns() {
59-
fmt.Printf("rec[%d][%q]: %v\n", i, rec.ColumnName(i), col)
59+
fmt.Printf("rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
6060
}
6161
n++
6262
}
6363

6464
// Output:
6565
// rec[0]["i64"]: [0]
66-
// rec[1]["f64"]: [0]
67-
// rec[2]["str"]: ["str-0"]
68-
// rec[0]["i64"]: [1]
66+
// rec[0]["f64"]: [0]
67+
// rec[0]["str"]: ["str-0"]
68+
// rec[1]["i64"]: [1]
6969
// rec[1]["f64"]: [1]
70-
// rec[2]["str"]: ["str-1"]
71-
// rec[0]["i64"]: [2]
72-
// rec[1]["f64"]: [2]
70+
// rec[1]["str"]: ["str-1"]
71+
// rec[2]["i64"]: [2]
72+
// rec[2]["f64"]: [2]
7373
// rec[2]["str"]: ["str-2"]
74-
// rec[0]["i64"]: [3]
75-
// rec[1]["f64"]: [3]
76-
// rec[2]["str"]: ["str-3"]
77-
// rec[0]["i64"]: [4]
78-
// rec[1]["f64"]: [4]
79-
// rec[2]["str"]: ["str-4"]
80-
// rec[0]["i64"]: [5]
81-
// rec[1]["f64"]: [5]
82-
// rec[2]["str"]: ["str-5"]
83-
// rec[0]["i64"]: [6]
84-
// rec[1]["f64"]: [6]
85-
// rec[2]["str"]: ["str-6"]
86-
// rec[0]["i64"]: [7]
87-
// rec[1]["f64"]: [7]
88-
// rec[2]["str"]: ["str-7"]
89-
// rec[0]["i64"]: [8]
90-
// rec[1]["f64"]: [8]
91-
// rec[2]["str"]: ["str-8"]
92-
// rec[0]["i64"]: [9]
93-
// rec[1]["f64"]: [9]
94-
// rec[2]["str"]: ["str-9"]
74+
// rec[3]["i64"]: [3]
75+
// rec[3]["f64"]: [3]
76+
// rec[3]["str"]: ["str-3"]
77+
// rec[4]["i64"]: [4]
78+
// rec[4]["f64"]: [4]
79+
// rec[4]["str"]: ["str-4"]
80+
// rec[5]["i64"]: [5]
81+
// rec[5]["f64"]: [5]
82+
// rec[5]["str"]: ["str-5"]
83+
// rec[6]["i64"]: [6]
84+
// rec[6]["f64"]: [6]
85+
// rec[6]["str"]: ["str-6"]
86+
// rec[7]["i64"]: [7]
87+
// rec[7]["f64"]: [7]
88+
// rec[7]["str"]: ["str-7"]
89+
// rec[8]["i64"]: [8]
90+
// rec[8]["f64"]: [8]
91+
// rec[8]["str"]: ["str-8"]
92+
// rec[9]["i64"]: [9]
93+
// rec[9]["f64"]: [9]
94+
// rec[9]["str"]: ["str-9"]
9595
}
9696

9797
func Example_withChunk() {
@@ -127,24 +127,24 @@ func Example_withChunk() {
127127
for r.Next() {
128128
rec := r.Record()
129129
for i, col := range rec.Columns() {
130-
fmt.Printf("rec[%d][%q]: %v\n", i, rec.ColumnName(i), col)
130+
fmt.Printf("rec[%d][%q]: %v\n", n, rec.ColumnName(i), col)
131131
}
132132
n++
133133
}
134134

135135
// Output:
136136
// rec[0]["i64"]: [0 1 2]
137-
// rec[1]["f64"]: [0 1 2]
138-
// rec[2]["str"]: ["str-0" "str-1" "str-2"]
139-
// rec[0]["i64"]: [3 4 5]
137+
// rec[0]["f64"]: [0 1 2]
138+
// rec[0]["str"]: ["str-0" "str-1" "str-2"]
139+
// rec[1]["i64"]: [3 4 5]
140140
// rec[1]["f64"]: [3 4 5]
141-
// rec[2]["str"]: ["str-3" "str-4" "str-5"]
142-
// rec[0]["i64"]: [6 7 8]
143-
// rec[1]["f64"]: [6 7 8]
141+
// rec[1]["str"]: ["str-3" "str-4" "str-5"]
142+
// rec[2]["i64"]: [6 7 8]
143+
// rec[2]["f64"]: [6 7 8]
144144
// rec[2]["str"]: ["str-6" "str-7" "str-8"]
145-
// rec[0]["i64"]: [9]
146-
// rec[1]["f64"]: [9]
147-
// rec[2]["str"]: ["str-9"]
145+
// rec[3]["i64"]: [9]
146+
// rec[3]["f64"]: [9]
147+
// rec[3]["str"]: ["str-9"]
148148
}
149149

150150
func TestCSVReader(t *testing.T) {

0 commit comments

Comments
 (0)