From bec4f7611f24ccc37a5e5b3276e9a795e823facc Mon Sep 17 00:00:00 2001 From: Andy Yang Date: Mon, 27 Feb 2023 14:16:28 -0500 Subject: [PATCH] importer: fix scanning of newlines in enclosed fields for DELIMITED DATA Previously, when using `IMPORT INTO ... DELIMITED DATA`, unescaped newlines within quoted fields were treated as row terminators instead of as part of the field. This patch refactors the scanning logic to correctly interpret these newlines. Release note (bug fix): `IMPORT INTO ... DELIMITED DATA` will now correctly handle quoted fields that contain unescaped newlines. --- pkg/sql/importer/import_stmt_test.go | 10 +++++ pkg/sql/importer/read_import_mysqlout.go | 48 ++++++++++++------------ 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/pkg/sql/importer/import_stmt_test.go b/pkg/sql/importer/import_stmt_test.go index 68cc09ee25dd..5644fc8a96e2 100644 --- a/pkg/sql/importer/import_stmt_test.go +++ b/pkg/sql/importer/import_stmt_test.go @@ -415,6 +415,16 @@ ORDER BY table_name `SELECT * from t`: {{"foo", "normal"}}, }, }, + { + name: "unescaped newline in quoted field", + create: `a string, b string`, + with: `WITH fields_enclosed_by = '$'`, + typ: "DELIMITED", + data: "foo\t$foo\nbar$\nfoo\tbar", + query: map[string][][]string{ + `SELECT * FROM t`: {{"foo", "foo\nbar"}, {"foo", "bar"}}, + }, + }, { name: "field enclosure in middle of unquoted field", create: `a string, b string`, diff --git a/pkg/sql/importer/read_import_mysqlout.go b/pkg/sql/importer/read_import_mysqlout.go index 0b3462cd4c29..e1df03ed1674 100644 --- a/pkg/sql/importer/read_import_mysqlout.go +++ b/pkg/sql/importer/read_import_mysqlout.go @@ -84,7 +84,6 @@ type delimitedProducer struct { reader *bufio.Reader row []rune err error - eof bool } var _ importRowProducer = &delimitedProducer{} @@ -94,26 +93,14 @@ func (d *delimitedProducer) Scan() bool { d.row = nil var r rune var w int - nextLiteral := false - fieldEnclosed := false + var gotEncloser, inEnclosedField, inEscapeSeq, inField bool for { r, w, d.err = d.reader.ReadRune() if d.err == io.EOF { - d.eof = true d.err = nil + return d.row != nil } - - if d.eof { - if d.row != nil { - return true - } - if nextLiteral { - d.err = io.ErrUnexpectedEOF - } - return false - } - if d.err != nil { return false } @@ -130,23 +117,34 @@ func (d *delimitedProducer) Scan() bool { r = rune(raw) } - if r == d.opts.RowSeparator && !nextLiteral && !fieldEnclosed { + if d.opts.Enclose != roachpb.MySQLOutfileOptions_Never { + // We only care about well-formed, enclosed fields (i.e. fields that + // start and end with the encloser rune with no additional runes either + // before or after the field). More precisely: 1) an encloser only + // starts a field if it is at the start of a row or immediately follows + // a field terminator and 2) an encloser only ends a field if it is + // immediately followed by the field terminator rune. + // We let FillDatums take care of reporting and handling any errors. + if inEnclosedField && gotEncloser && (r == d.opts.FieldSeparator || r == d.opts.RowSeparator) { + inEnclosedField = false + } + gotEncloser = r == d.opts.Encloser + if gotEncloser && !inField { + inEnclosedField = true + } + } + + if r == d.opts.RowSeparator && !inEscapeSeq && !inEnclosedField { return true } - d.row = append(d.row, r) + inField = !(r == d.opts.FieldSeparator) if d.opts.HasEscape { - nextLiteral = !nextLiteral && r == d.opts.Escape + inEscapeSeq = !inEscapeSeq && r == d.opts.Escape } - if d.opts.Enclose != roachpb.MySQLOutfileOptions_Never && r == d.opts.Encloser { - // We only care about well formed, enclosed fields (i.e. ones that start with - // enclose rune. If we see enclose character anywhere else, then we either - // close the opened enclosing, or we treat this as an invalid enclosing, - // and let FillDatums below take care of reporting and handling any errors. - fieldEnclosed = len(d.row) == 1 - } + d.row = append(d.row, r) } }