Skip to content

Commit

Permalink
apacheGH-39552: [Go] inclusion of option to use replacer when creatin…
Browse files Browse the repository at this point in the history
…g csv strings with go library (apache#39576)

Rationale for this change
Make it possible to remove unwanted characters from strings

What changes are included in this PR?
Add new function to optionally  setup a replacer in csv Writer  Write method

Are these changes tested?
Yes

Are there any user-facing changes?
Adds an optional methods.

* Closes: apache#39552

Lead-authored-by: Jânio <[email protected]>
Co-authored-by: janiodev <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
  • Loading branch information
2 people authored and thisisnic committed Mar 8, 2024
1 parent d4473bb commit fed2ceb
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 19 deletions.
14 changes: 14 additions & 0 deletions go/arrow/csv/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ package csv
import (
"errors"
"fmt"
"strings"

"github.com/apache/arrow/go/v15/arrow"
"github.com/apache/arrow/go/v15/arrow/memory"
Expand Down Expand Up @@ -223,6 +224,19 @@ func WithIncludeColumns(cols []string) Option {
}
}

// WithStringsReplacer receives a replacer to be applied in the string fields
// of the CSV. This is useful to remove unwanted characters from the string.
func WithStringsReplacer(replacer *strings.Replacer) Option {
return func(cfg config) {
switch cfg := cfg.(type) {
case *Writer:
cfg.stringReplacer = replacer.Replace
default:
panic(fmt.Errorf("arrow/csv: unknown config type %T", cfg))
}
}
}

func validate(schema *arrow.Schema) {
for i, f := range schema.Fields() {
switch ft := f.Type.(type) {
Expand Down
12 changes: 6 additions & 6 deletions go/arrow/csv/transformer.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import (
"github.com/apache/arrow/go/v15/arrow/array"
)

func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) []string {
func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array, stringsReplacer func(string)string) []string {
res := make([]string, col.Len())
switch typ.(type) {
case *arrow.BooleanType:
Expand Down Expand Up @@ -144,7 +144,7 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) []
arr := col.(*array.String)
for i := 0; i < arr.Len(); i++ {
if arr.IsValid(i) {
res[i] = arr.Value(i)
res[i] = stringsReplacer(arr.Value(i))
} else {
res[i] = w.nullValue
}
Expand All @@ -153,7 +153,7 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) []
arr := col.(*array.LargeString)
for i := 0; i < arr.Len(); i++ {
if arr.IsValid(i) {
res[i] = arr.Value(i)
res[i] = stringsReplacer(arr.Value(i))
} else {
res[i] = w.nullValue
}
Expand Down Expand Up @@ -224,7 +224,7 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) []
var b bytes.Buffer
b.Write([]byte{'{'})
writer := csv.NewWriter(&b)
writer.Write(w.transformColToStringArr(list.DataType(), list))
writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer))
writer.Flush()
b.Truncate(b.Len() - 1)
b.Write([]byte{'}'})
Expand All @@ -243,7 +243,7 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) []
var b bytes.Buffer
b.Write([]byte{'{'})
writer := csv.NewWriter(&b)
writer.Write(w.transformColToStringArr(list.DataType(), list))
writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer))
writer.Flush()
b.Truncate(b.Len() - 1)
b.Write([]byte{'}'})
Expand All @@ -262,7 +262,7 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) []
var b bytes.Buffer
b.Write([]byte{'{'})
writer := csv.NewWriter(&b)
writer.Write(w.transformColToStringArr(list.DataType(), list))
writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer))
writer.Flush()
b.Truncate(b.Len() - 1)
b.Write([]byte{'}'})
Expand Down
24 changes: 13 additions & 11 deletions go/arrow/csv/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,13 @@ import (

// Writer wraps encoding/csv.Writer and writes arrow.Record based on a schema.
type Writer struct {
boolFormatter func(bool) string
header bool
nullValue string
once sync.Once
schema *arrow.Schema
w *csv.Writer
boolFormatter func(bool) string
header bool
nullValue string
stringReplacer func(string) string
once sync.Once
schema *arrow.Schema
w *csv.Writer
}

// NewWriter returns a writer that writes arrow.Records to the CSV file
Expand All @@ -45,10 +46,11 @@ func NewWriter(w io.Writer, schema *arrow.Schema, opts ...Option) *Writer {
validate(schema)

ww := &Writer{
boolFormatter: strconv.FormatBool, // override by passing WithBoolWriter() as an option
nullValue: "NULL", // override by passing WithNullWriter() as an option
schema: schema,
w: csv.NewWriter(w),
boolFormatter: strconv.FormatBool, // override by passing WithBoolWriter() as an option
nullValue: "NULL", // override by passing WithNullWriter() as an option
stringReplacer: func(x string) string { return x }, // override by passing WithStringsReplacer() as an option
schema: schema,
w: csv.NewWriter(w),
}
for _, opt := range opts {
opt(ww)
Expand Down Expand Up @@ -81,7 +83,7 @@ func (w *Writer) Write(record arrow.Record) error {
}

for j, col := range record.Columns() {
rows := w.transformColToStringArr(w.schema.Field(j).Type, col)
rows := w.transformColToStringArr(w.schema.Field(j).Type, col, w.stringReplacer)
for i, row := range rows {
recs[i][j] = row
}
Expand Down
6 changes: 4 additions & 2 deletions go/arrow/csv/writer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"fmt"
"io"
"log"
"strings"
"testing"

"github.com/apache/arrow/go/v15/arrow"
Expand Down Expand Up @@ -250,8 +251,8 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo
b.Field(9).(*array.Float16Builder).AppendValues([]float16.Num{float16.New(0.0), float16.New(0.1), float16.New(0.2)}, nil)
b.Field(10).(*array.Float32Builder).AppendValues([]float32{0.0, 0.1, 0.2}, nil)
b.Field(11).(*array.Float64Builder).AppendValues([]float64{0.0, 0.1, 0.2}, nil)
b.Field(12).(*array.StringBuilder).AppendValues([]string{"str-0", "str-1", "str-2"}, nil)
b.Field(13).(*array.LargeStringBuilder).AppendValues([]string{"str-0", "str-1", "str-2"}, nil)
b.Field(12).(*array.StringBuilder).AppendValues([]string{"str_0", "str-1", "str-2"}, nil)
b.Field(13).(*array.LargeStringBuilder).AppendValues([]string{"str_0", "str-1", "str-2"}, nil)
b.Field(14).(*array.TimestampBuilder).AppendValues(genTimestamps(arrow.Second), nil)
b.Field(15).(*array.Date32Builder).AppendValues([]arrow.Date32{17304, 19304, 20304}, nil)
b.Field(16).(*array.Date64Builder).AppendValues([]arrow.Date64{1840400000000, 1940400000000, 2040400000000}, nil)
Expand Down Expand Up @@ -300,6 +301,7 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo
csv.WithHeader(writeHeader),
csv.WithNullWriter(nullVal),
csv.WithBoolWriter(fmtr),
csv.WithStringsReplacer(strings.NewReplacer("_", "-")),
)
err := w.Write(rec)
if err != nil {
Expand Down

0 comments on commit fed2ceb

Please sign in to comment.