importer: fix collated string imports for MySQL/DELIMITED DATA

We previously special cased `ParseDatumStringWithRawBytes` for MySQL related imports, which was buggy for collated strings as well. Instead, make import have a specialised method for converting MySQL literals to the relevant data type without compromising on collated strings. Release note (bug fix): Previously, using IMPORT INTO for DELIMITED DATA or MySQL imports would error with `column ... does not exist` if it was importing into a collated string column.
cockroachdb · Aug 7, 2023 · 9679561 · 9679561
1 parent 4bc18b6
commit 9679561
Show file tree

Hide file tree

Showing 5 changed files with 30 additions and 27 deletions.
diff --git a/pkg/sql/importer/import_stmt_test.go b/pkg/sql/importer/import_stmt_test.go
@@ -5695,6 +5695,16 @@ func TestImportDelimited(t *testing.T) {
 				sqlDB.Exec(t, intoCmd, opts...)
 				checkQueryResults(t, fmt.Sprintf(`SELECT i, s, b FROM into%d ORDER BY i`, i))
 			})
+			t.Run("import-into-collated", func(t *testing.T) {
+				defer sqlDB.Exec(t, fmt.Sprintf(`DROP TABLE into%d`, i))
+				sqlDB.Exec(t, fmt.Sprintf(`CREATE TABLE into%d (i INT8 PRIMARY KEY, s text COLLATE "en_US", b bytea)`, i))
+				intoCmd := fmt.Sprintf(`IMPORT INTO into%d (i, s, b) DELIMITED DATA ($1)`, i)
+				if len(flags) > 0 {
+					intoCmd += " WITH " + strings.Join(flags, ", ")
+				}
+				sqlDB.Exec(t, intoCmd, opts...)
+				checkQueryResults(t, fmt.Sprintf(`SELECT i, s, b FROM into%d ORDER BY i`, i))
+			})
 			t.Run("import-into-target-cols-reordered", func(t *testing.T) {
 				defer sqlDB.Exec(t, fmt.Sprintf(`DROP TABLE into%d`, i))
 				sqlDB.Exec(t, fmt.Sprintf(`CREATE TABLE into%d (b bytea, i INT8 PRIMARY KEY, s text)`, i))

diff --git a/pkg/sql/importer/read_import_mysql.go b/pkg/sql/importer/read_import_mysql.go
@@ -206,6 +206,20 @@ const (
 	zeroTime = "0000-00-00 00:00:00"
 )
 
+func mysqlStrToDatum(evalCtx *eval.Context, s string, desired *types.T) (tree.Datum, error) {
+	switch desired.Family() {
+	case types.BytesFamily:
+		// mysql emits raw byte strings that do not use the same escaping as our
+		// tree.ParseDBytes function expects, and the difference between
+		// tree.ParseAndRequireString and mysqlStrToDatum is whether or not it
+		// attempts to parse bytes.
+		return tree.NewDBytes(tree.DBytes(s)), nil
+	default:
+		res, _, err := tree.ParseAndRequireString(desired, s, evalCtx)
+		return res, err
+	}
+}
+
 // mysqlValueToDatum attempts to convert a value, as parsed from a mysqldump
 // INSERT statement, in to a Cockroach Datum of type `desired`. The MySQL parser
 // does not parse the values themselves to Go primitivies, rather leaving the
@@ -239,11 +253,7 @@ func mysqlValueToDatum(
 					}
 				}
 			}
-			// This uses ParseDatumStringAsWithRawBytes instead of ParseDatumStringAs since mysql emits
-			// raw byte strings that do not use the same escaping as our ParseBytes
-			// function expects, and the difference between ParseStringAs and
-			// ParseDatumStringAs is whether or not it attempts to parse bytes.
-			return rowenc.ParseDatumStringAsWithRawBytes(ctx, desired, s, evalContext)
+			return mysqlStrToDatum(evalContext, s, desired)
 		case mysql.IntVal:
 			return rowenc.ParseDatumStringAs(ctx, desired, string(v.Val), evalContext)
 		case mysql.FloatVal:

diff --git a/pkg/sql/importer/read_import_mysql_test.go b/pkg/sql/importer/read_import_mysql_test.go
@@ -43,6 +43,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
 	"github.com/kr/pretty"
+	"github.com/stretchr/testify/require"
 	mysql "vitess.io/vitess/go/vt/sqlparser"
 )
 
@@ -356,13 +357,16 @@ func TestMysqlValueToDatum(t *testing.T) {
 		}
 		return d
 	}
+	asdfCS, err := tree.NewDCollatedString("Asdf", "en_US", &tree.CollationEnvironment{})
+	require.NoError(t, err)
 	tests := []struct {
 		raw  mysql.Expr
 		typ  *types.T
 		want tree.Datum
 	}{
 		{raw: mysql.NewStrLiteral([]byte("0000-00-00")), typ: types.Date, want: tree.DNull},
 		{raw: mysql.NewStrLiteral([]byte("2010-01-01")), typ: types.Date, want: date("2010-01-01")},
+		{raw: mysql.NewStrLiteral([]byte("Asdf")), typ: types.MakeCollatedString(types.String, "en_US"), want: asdfCS},
 		{raw: mysql.NewStrLiteral([]byte("0000-00-00 00:00:00")), typ: types.Timestamp, want: tree.DNull},
 		{raw: mysql.NewStrLiteral([]byte("2010-01-01 00:00:00")), typ: types.Timestamp, want: ts("2010-01-01 00:00:00")},
 	}

diff --git a/pkg/sql/importer/read_import_mysqlout.go b/pkg/sql/importer/read_import_mysqlout.go
@@ -23,7 +23,6 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/security/username"
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog"
 	"github.com/cockroachdb/cockroach/pkg/sql/row"
-	"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
 	"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
 	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
 	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
@@ -258,12 +257,8 @@ func (d *delimitedConsumer) FillDatums(
 		} else if (!d.opts.HasEscape && field == "NULL") || d.opts.NullEncoding != nil && field == *d.opts.NullEncoding {
 			datum = tree.DNull
 		} else {
-			// This uses ParseDatumStringAsWithRawBytes instead of ParseDatumStringAs since mysql emits
-			// raw byte strings that do not use the same escaping as our ParseBytes
-			// function expects, and the difference between ParseStringAs and
-			// ParseDatumStringAs is whether or not it attempts to parse bytes.
 			var err error
-			datum, err = rowenc.ParseDatumStringAsWithRawBytes(ctx, conv.VisibleColTypes[datumIdx], field, conv.EvalCtx)
+			datum, err = mysqlStrToDatum(conv.EvalCtx, field, conv.VisibleColTypes[datumIdx])
 			if err != nil {
 				col := conv.VisibleCols[datumIdx]
 				return newImportRowError(

diff --git a/pkg/sql/rowenc/roundtrip_format.go b/pkg/sql/rowenc/roundtrip_format.go
@@ -35,22 +35,6 @@ func ParseDatumStringAs(
 	}
 }
 
-// ParseDatumStringAsWithRawBytes parses s as type t. However, if the requested type is Bytes
-// then the string is returned unchanged. This function is used when the input string might be
-// unescaped raw bytes, so we don't want to run a bytes parsing routine on the input. Other
-// than the bytes case, this function does the same as ParseDatumStringAs but is not
-// guaranteed to round-trip.
-func ParseDatumStringAsWithRawBytes(
-	ctx context.Context, t *types.T, s string, evalCtx *eval.Context,
-) (tree.Datum, error) {
-	switch t.Family() {
-	case types.BytesFamily:
-		return tree.NewDBytes(tree.DBytes(s)), nil
-	default:
-		return ParseDatumStringAs(ctx, t, s, evalCtx)
-	}
-}
-
 func parseAsTyp(
 	ctx context.Context, evalCtx *eval.Context, typ *types.T, s string,
 ) (tree.Datum, error) {