Skip to content

Commit

Permalink
importer: treat zero-length string as NULL in CSV import
Browse files Browse the repository at this point in the history
Release note (backward-incompatible change): If no `nullif` option is specified
while using IMPORT CSV, then a zero-length string in the input is now treated as
NULL. The quoted empty string in the input is treated as an empty string. Similarly,
if `nullif` is specified, then an unquoted value is treated as NULL, and a
quoted value is treated as that string. These changes were made to make IMPORT CSV
behave more similarly to COPY CSV.

If the previous behavior (i.e. treating either quoted or unquoted values
that match the `nullif` setting as NULL) is desired, then use the new
`allow_quoted_null` option in the IMPORT statement.
  • Loading branch information
rafiss committed Aug 10, 2022
1 parent 7cefe60 commit c40f381
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 19 deletions.
2 changes: 2 additions & 0 deletions pkg/roachpb/io-formats.proto
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ message CSVOptions {
// Indicates the number of rows to import per CSV file.
// Must be a non-zero positive number.
optional int64 row_limit = 6 [(gogoproto.nullable) = false];
// allow_quoted_null
optional bool allow_quoted_null = 7 [(gogoproto.nullable) = false];
}

// MySQLOutfileOptions describe the format of mysql's outfile.
Expand Down
32 changes: 19 additions & 13 deletions pkg/sql/importer/import_planning.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,13 @@ import (
)

const (
csvDelimiter = "delimiter"
csvComment = "comment"
csvNullIf = "nullif"
csvSkip = "skip"
csvRowLimit = "row_limit"
csvStrictQuotes = "strict_quotes"
csvDelimiter = "delimiter"
csvComment = "comment"
csvNullIf = "nullif"
csvSkip = "skip"
csvRowLimit = "row_limit"
csvStrictQuotes = "strict_quotes"
csvAllowQuotedNulls = "allow_quoted_null"

mysqlOutfileRowSep = "rows_terminated_by"
mysqlOutfileFieldSep = "fields_terminated_by"
Expand Down Expand Up @@ -105,12 +106,13 @@ const (
)

var importOptionExpectValues = map[string]sql.KVStringOptValidate{
csvDelimiter: sql.KVStringOptRequireValue,
csvComment: sql.KVStringOptRequireValue,
csvNullIf: sql.KVStringOptRequireValue,
csvSkip: sql.KVStringOptRequireValue,
csvRowLimit: sql.KVStringOptRequireValue,
csvStrictQuotes: sql.KVStringOptRequireNoValue,
csvDelimiter: sql.KVStringOptRequireValue,
csvComment: sql.KVStringOptRequireValue,
csvNullIf: sql.KVStringOptRequireValue,
csvSkip: sql.KVStringOptRequireValue,
csvRowLimit: sql.KVStringOptRequireValue,
csvStrictQuotes: sql.KVStringOptRequireNoValue,
csvAllowQuotedNulls: sql.KVStringOptRequireNoValue,

mysqlOutfileRowSep: sql.KVStringOptRequireValue,
mysqlOutfileFieldSep: sql.KVStringOptRequireValue,
Expand Down Expand Up @@ -169,7 +171,7 @@ var avroAllowedOptions = makeStringSet(
)

var csvAllowedOptions = makeStringSet(
csvDelimiter, csvComment, csvNullIf, csvSkip, csvStrictQuotes, csvRowLimit,
csvDelimiter, csvComment, csvNullIf, csvSkip, csvStrictQuotes, csvRowLimit, csvAllowQuotedNulls,
)

var mysqlOutAllowedOptions = makeStringSet(
Expand Down Expand Up @@ -543,6 +545,10 @@ func importPlanHook(
format.Csv.NullEncoding = &override
}

if _, ok := opts[csvAllowQuotedNulls]; ok {
format.Csv.AllowQuotedNull = true
}

if override, ok := opts[csvSkip]; ok {
skip, err := strconv.Atoi(override)
if err != nil {
Expand Down
100 changes: 96 additions & 4 deletions pkg/sql/importer/import_stmt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,68 @@ ORDER BY table_name
`SELECT * from t`: {{"NULL", "foop"}},
},
},
{
name: "zero string is the default for nullif with CSV",
create: `
i int primary key,
s string
`,
typ: "CSV",
data: `1,
2,""`,
query: map[string][][]string{
`SELECT i, s from t`: {
{"1", "NULL"},
{"2", ""},
},
},
},
{
name: "zero string in not null",
create: `
i int primary key,
s string,
s2 string not null
`,
typ: "CSV",
data: `1,,
2,"",""`,
err: "null value in column \"s2\" violates not-null constraint",
},
{
name: "quoted nullif is treated as a string",
create: `
i int primary key,
s string
`,
with: `WITH nullif = 'foo'`,
typ: "CSV",
data: `1,foo
2,"foo"`,
query: map[string][][]string{
`SELECT i, s from t`: {
{"1", "NULL"},
{"2", "foo"},
},
},
},
{
name: "quoted nullif is treated as a null if allow_quoted_null is used",
create: `
i int primary key,
s string
`,
with: `WITH nullif = 'foo', allow_quoted_null`,
typ: "CSV",
data: `1,foo
2,"foo"`,
query: map[string][][]string{
`SELECT i, s from t`: {
{"1", "NULL"},
{"2", "NULL"},
},
},
},

// PG COPY
{
Expand Down Expand Up @@ -2379,22 +2441,42 @@ func TestImportCSVStmt(t *testing.T) {
f STRING DEFAULT 's',
PRIMARY KEY (a, b, c)
)`
query = `IMPORT INTO t CSV DATA ($1)`
nullif = ` WITH nullif=''`
query = `IMPORT INTO t CSV DATA ($1)`
nullif = ` WITH nullif=''`
allowQuotedNulls = `, allow_quoted_null`
)

sqlDB.Exec(t, create)

data = ",5,e,7,,"
t.Run(data, func(t *testing.T) {
sqlDB.ExpectErr(
t, `row 1: parse "a" as INT8: could not parse ""`,
t, `row 1: generate insert row: null value in column "a" violates not-null constraint`,
query, srv.URL,
)
sqlDB.ExpectErr(
t, `row 1: generate insert row: null value in column "a" violates not-null constraint`,
query+nullif, srv.URL,
)
sqlDB.ExpectErr(
t, `row 1: generate insert row: null value in column "a" violates not-null constraint`,
query+nullif+allowQuotedNulls, srv.URL,
)
})
data = "\"\",5,e,7,,"
t.Run(data, func(t *testing.T) {
sqlDB.ExpectErr(
t, `row 1: parse "a" as INT8: could not parse ""`,
query, srv.URL,
)
sqlDB.ExpectErr(
t, `row 1: parse "a" as INT8: could not parse ""`,
query+nullif, srv.URL,
)
sqlDB.ExpectErr(
t, `row 1: generate insert row: null value in column "a" violates not-null constraint`,
query+nullif+allowQuotedNulls, srv.URL,
)
})
data = "2,5,e,,,"
t.Run(data, func(t *testing.T) {
Expand Down Expand Up @@ -3754,7 +3836,17 @@ func (s *csvBenchmarkStream) Read(buf []byte) (int, error) {
if err != nil {
return 0, err
}
return copy(buf, strings.Join(r.([]string), "\t")+"\n"), nil
row := r.([]csv.Record)
if len(row) == 0 {
return copy(buf, "\n"), nil
}
var b strings.Builder
b.WriteString(row[0].String())
for _, v := range row[1:] {
b.WriteString("\t")
b.WriteString(v.String())
}
return copy(buf, b.String()+"\n"), nil
}
return 0, io.EOF
}
Expand Down
14 changes: 12 additions & 2 deletions pkg/sql/importer/read_import_csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,18 @@ func (c *csvRowConsumer) FillDatums(
continue
}

if c.opts.NullEncoding != nil &&
field.Val == *c.opts.NullEncoding {
// NullEncoding is stored as a *string historically, from before we wanted
// it to default to "". Rather than changing the proto, we just set the
// default here.
nullEncoding := ""
if c.opts.NullEncoding != nil {
nullEncoding = *c.opts.NullEncoding
}
if (!field.Quoted || c.opts.AllowQuotedNull) && field.Val == nullEncoding {
// To match COPY, the default behavior is to only treat the field as NULL
// if it was not quoted (and if it matches the configured NullEncoding).
// The AllowQuotedNull option can be used to get the old behavior where
// even a quoted value is treated as NULL.
conv.Datums[datumIdx] = tree.DNull
} else {
var err error
Expand Down
7 changes: 7 additions & 0 deletions pkg/util/encoding/csv/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,13 @@ type Record struct {
Quoted bool
}

func (r *Record) String() string {
if r.Quoted {
return "\"" + r.Val + "\""
}
return r.Val
}

func (r *Reader) readRecord(dst []Record) ([]Record, error) {
if r.Comma == r.Comment || !validDelim(r.Comma) || (r.Comment != 0 && !validDelim(r.Comment)) {
return nil, errInvalidDelim
Expand Down

0 comments on commit c40f381

Please sign in to comment.