Skip to content

Commit

Permalink
importer: fix scanning of newlines in enclosed fields for DELIMITED DATA
Browse files Browse the repository at this point in the history
Previously, when using `IMPORT INTO ... DELIMITED DATA`, unescaped
newlines within quoted fields were treated as row terminators instead
of as part of the field. This patch refactors the scanning logic to
correctly interpret these newlines.

Release note (bug fix): `IMPORT INTO ... DELIMITED DATA` will now
correctly handle quoted fields that contain unescaped newlines.
  • Loading branch information
andyyang890 committed Feb 27, 2023
1 parent c5068fc commit bec4f76
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 25 deletions.
10 changes: 10 additions & 0 deletions pkg/sql/importer/import_stmt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,16 @@ ORDER BY table_name
`SELECT * from t`: {{"foo", "normal"}},
},
},
{
name: "unescaped newline in quoted field",
create: `a string, b string`,
with: `WITH fields_enclosed_by = '$'`,
typ: "DELIMITED",
data: "foo\t$foo\nbar$\nfoo\tbar",
query: map[string][][]string{
`SELECT * FROM t`: {{"foo", "foo\nbar"}, {"foo", "bar"}},
},
},
{
name: "field enclosure in middle of unquoted field",
create: `a string, b string`,
Expand Down
48 changes: 23 additions & 25 deletions pkg/sql/importer/read_import_mysqlout.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ type delimitedProducer struct {
reader *bufio.Reader
row []rune
err error
eof bool
}

var _ importRowProducer = &delimitedProducer{}
Expand All @@ -94,26 +93,14 @@ func (d *delimitedProducer) Scan() bool {
d.row = nil
var r rune
var w int
nextLiteral := false
fieldEnclosed := false
var gotEncloser, inEnclosedField, inEscapeSeq, inField bool

for {
r, w, d.err = d.reader.ReadRune()
if d.err == io.EOF {
d.eof = true
d.err = nil
return d.row != nil
}

if d.eof {
if d.row != nil {
return true
}
if nextLiteral {
d.err = io.ErrUnexpectedEOF
}
return false
}

if d.err != nil {
return false
}
Expand All @@ -130,23 +117,34 @@ func (d *delimitedProducer) Scan() bool {
r = rune(raw)
}

if r == d.opts.RowSeparator && !nextLiteral && !fieldEnclosed {
if d.opts.Enclose != roachpb.MySQLOutfileOptions_Never {
// We only care about well-formed, enclosed fields (i.e. fields that
// start and end with the encloser rune with no additional runes either
// before or after the field). More precisely: 1) an encloser only
// starts a field if it is at the start of a row or immediately follows
// a field terminator and 2) an encloser only ends a field if it is
// immediately followed by the field terminator or row terminator rune.
// We let FillDatums take care of reporting and handling any errors.
if inEnclosedField && gotEncloser && (r == d.opts.FieldSeparator || r == d.opts.RowSeparator) {
inEnclosedField = false
}
gotEncloser = r == d.opts.Encloser
if gotEncloser && !inField {
inEnclosedField = true
}
}

if r == d.opts.RowSeparator && !inEscapeSeq && !inEnclosedField {
return true
}

d.row = append(d.row, r)
inField = !(r == d.opts.FieldSeparator)

if d.opts.HasEscape {
nextLiteral = !nextLiteral && r == d.opts.Escape
inEscapeSeq = !inEscapeSeq && r == d.opts.Escape
}

if d.opts.Enclose != roachpb.MySQLOutfileOptions_Never && r == d.opts.Encloser {
// We only care about well-formed, enclosed fields (i.e. ones that start with
// the enclose rune). If we see the enclose character anywhere else, then we
// either close the opened enclosing, or we treat this as an invalid enclosing,
// and let FillDatums below take care of reporting and handling any errors.
fieldEnclosed = len(d.row) == 1
}
d.row = append(d.row, r)
}
}

Expand Down

0 comments on commit bec4f76

Please sign in to comment.