Skip to content

Commit

Permalink
importccl: support unique_rowid() as default expression for IMPORT …
Browse files Browse the repository at this point in the history
…INTO

PR #50295 added support for non-targeted columns with constant expressions.
This PR is a follow up to that in adding support to `unique_rowid()`.
This is done by assigning the same value of `rowid` that was generated
at the `IMPORT` stage (row converter) as the default value.

Release note (general change): IMPORT INTO now supports `unique_rowid()`
as a default expression.
  • Loading branch information
anzoteh96 committed Jul 9, 2020
1 parent e84d0cb commit 6fa2151
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 24 deletions.
80 changes: 72 additions & 8 deletions pkg/ccl/importccl/import_stmt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,39 @@ import (
"github.com/stretchr/testify/require"
)

// checkUnique reports whether the values found at the column indexes inds
// are pairwise distinct across every row of allStr (the string-rendered
// result of a query). Uniqueness is checked over the union of all selected
// columns, which is what we want when several columns share a
// unique_rowid() default: no generated ID may repeat anywhere.
func checkUnique(allStr [][]string, inds []int) bool {
	// Pre-size for one entry per selected cell (rows * selected columns),
	// not merely per row, to avoid map growth while inserting.
	uniqStr := make(map[string]struct{}, len(allStr)*len(inds))
	for _, row := range allStr {
		for _, ind := range inds {
			s := row[ind]
			if _, ok := uniqStr[s]; ok {
				return false
			}
			uniqStr[s] = struct{}{}
		}
	}
	return true
}

// checkNoNull reports whether none of the rows in allStr holds the literal
// string "NULL" at any of the column indexes in inds. Useful for verifying
// that unique_rowid() defaults were actually populated for every row.
func checkNoNull(allStr [][]string, inds []int) bool {
	rowHasNull := func(row []string) bool {
		for _, ind := range inds {
			if row[ind] == "NULL" {
				return true
			}
		}
		return false
	}
	for _, row := range allStr {
		if rowHasNull(row) {
			return false
		}
	}
	return true
}

// validUniqueRowID reports whether the selected columns look like valid
// unique_rowid() output: every value is distinct and none renders as "NULL".
func validUniqueRowID(allStr [][]string, inds []int) bool {
	if !checkUnique(allStr, inds) {
		return false
	}
	return checkNoNull(allStr, inds)
}

func TestImportData(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
Expand Down Expand Up @@ -2481,8 +2514,10 @@ func TestImportIntoCSV(t *testing.T) {
// Test that IMPORT INTO works when columns with default expressions are present.
// The default expressions supported by IMPORT INTO are constant expressions,
// which are literals and functions that always return the same value given the
// same arguments (examples of non-constant expressions are given in the last two
// subtests below). The default expression of a column is used when this column is not
// same arguments (examples of non-constant expressions are given as now()
// and nextval()). `unique_rowid()` is also supported.
//
// The default expression of a column is used when this column is not
// targeted; otherwise, data from source file (like CSV) is used. It also checks
// that IMPORT TABLE works when there are default columns.
t.Run("import-into-default", func(t *testing.T) {
Expand Down Expand Up @@ -2612,6 +2647,24 @@ func TestImportIntoCSV(t *testing.T) {
defer sqlDB.Exec(t, `DROP TABLE t`)
sqlDB.CheckQueryResults(t, `SELECT * from t`, [][]string{{"2", "42", "1"}, {"4", "42", "3"}})
})
t.Run("unique_rowid", func(t *testing.T) {
sqlDB.Exec(t, `CREATE TABLE t(a INT DEFAULT unique_rowid(), b INT, c STRING, d INT DEFAULT unique_rowid())`)
defer sqlDB.Exec(t, `DROP TABLE t`)
sqlDB.Exec(t, fmt.Sprintf(`INSERT INTO t (b, c) VALUES (3, 'CAT')`))
sqlDB.Exec(t, fmt.Sprintf(`IMPORT INTO t (b, c) CSV DATA (%s)`, strings.Join(testFiles.files, ", ")))
sqlDB.Exec(t, fmt.Sprintf(`INSERT INTO t (b, c) VALUES (4, 'DOG')`))
IDstr := sqlDB.QueryStr(t, `SELECT a, d FROM t`)
require.True(t, validUniqueRowID(IDstr, []int{0, 1}))
})
t.Run("unique_rowid_with_pk", func(t *testing.T) {
sqlDB.Exec(t, `CREATE TABLE t(a INT DEFAULT unique_rowid(), b INT PRIMARY KEY, c STRING)`)
defer sqlDB.Exec(t, `DROP TABLE t`)
sqlDB.Exec(t, fmt.Sprintf(`INSERT INTO t (b, c) VALUES (-3, 'CAT')`))
sqlDB.Exec(t, fmt.Sprintf(`IMPORT INTO t (b, c) CSV DATA (%s)`, strings.Join(testFiles.files, ", ")))
sqlDB.Exec(t, fmt.Sprintf(`INSERT INTO t (b, c) VALUES (-4, 'DOG')`))
IDstr := sqlDB.QueryStr(t, `SELECT a FROM t`)
require.True(t, validUniqueRowID(IDstr, []int{0}))
})
})

t.Run("import-not-targeted-not-null", func(t *testing.T) {
Expand Down Expand Up @@ -4084,14 +4137,25 @@ func TestImportPgDumpGeo(t *testing.T) {

// Verify both created tables are identical.
importCreate := sqlDB.QueryStr(t, "SELECT create_statement FROM [SHOW CREATE importdb.nyc_census_blocks]")
// Families are slightly different due to the geom column being last
// in exec and rowid being last in import, so swap that in import to
// match exec.
importCreate[0][0] = strings.Replace(importCreate[0][0], "geom, rowid", "rowid, geom", 1)
// Families are slightly different due to that rowid shows up in exec
// but not import (possibly due to the ALTER TABLE statement that makes
// gid a primary key), so add that into import to match exec.
importCreate[0][0] = strings.Replace(importCreate[0][0], "boroname, geom", "boroname, rowid, geom", 1)
sqlDB.CheckQueryResults(t, "SELECT create_statement FROM [SHOW CREATE execdb.nyc_census_blocks]", importCreate)

importSelect := sqlDB.QueryStr(t, "SELECT * FROM importdb.nyc_census_blocks ORDER BY PRIMARY KEY importdb.nyc_census_blocks")
sqlDB.CheckQueryResults(t, "SELECT * FROM execdb.nyc_census_blocks ORDER BY PRIMARY KEY execdb.nyc_census_blocks", importSelect)
// Drop the comparison of gid for import vs exec, then check that gid
// in import is indeed valid rowid.
importCols := "blkid, popn_total, popn_white, popn_black, popn_nativ, popn_asian, popn_other, boroname"
importSelect := sqlDB.QueryStr(t, fmt.Sprintf(
"SELECT (%s) FROM importdb.nyc_census_blocks ORDER BY PRIMARY KEY importdb.nyc_census_blocks",
importCols,
))
sqlDB.CheckQueryResults(t, fmt.Sprintf(
"SELECT (%s) FROM execdb.nyc_census_blocks ORDER BY PRIMARY KEY execdb.nyc_census_blocks",
importCols,
), importSelect)
importID := sqlDB.QueryStr(t, "SELECT (gid) FROM importdb.nyc_census_blocks ORDER BY PRIMARY KEY importdb.nyc_census_blocks")
require.True(t, validUniqueRowID(importID, []int{0}))
}

func TestImportCockroachDump(t *testing.T) {
Expand Down
8 changes: 2 additions & 6 deletions pkg/ccl/importccl/testdata/pgdump/geo.sql
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
-- The two comments below removing gid are there because IMPORT doesn't
-- support DEFAULT functions (#48253). This function is otherwise exactly
-- what shp2pgsql produces.

SET CLIENT_ENCODING TO UTF8;
SET STANDARD_CONFORMING_STRINGS TO ON;
BEGIN;
CREATE TABLE "nyc_census_blocks" (--gid serial,
CREATE TABLE "nyc_census_blocks" (gid serial,
"blkid" varchar(15),
"popn_total" float8,
"popn_white" float8,
Expand All @@ -14,7 +10,7 @@ CREATE TABLE "nyc_census_blocks" (--gid serial,
"popn_asian" float8,
"popn_other" float8,
"boroname" varchar(32));
--ALTER TABLE "nyc_census_blocks" ADD PRIMARY KEY (gid);
ALTER TABLE "nyc_census_blocks" ADD PRIMARY KEY (gid);
SELECT AddGeometryColumn('','nyc_census_blocks','geom','26918','MULTIPOLYGON',2);
INSERT INTO "nyc_census_blocks" ("blkid","popn_total","popn_white","popn_black","popn_nativ","popn_asian","popn_other","boroname",geom) VALUES ('360850009001000','97','51','32','1','5','8','Staten Island','010600002026690000010000000103000000010000000A00000051AC161881A22141A31409CF1F2A51415F4321458DA2214100102A3F1D2A51418C34807C0BA221414E3E89F5122A5141782D605495A12141780D1CE92A2A51410D1C9C6770A121410F2D6074322A5141441560E0B0A02141A00099C72F2A51412365B4789AA021419F60A7BB342A514160E3E8FA66A0214118B4C0CE402A5141EA4BF3EEC7A12141A3023D61452A514151AC161881A22141A31409CF1F2A5141');
INSERT INTO "nyc_census_blocks" ("blkid","popn_total","popn_white","popn_black","popn_nativ","popn_asian","popn_other","boroname",geom) VALUES ('360850020011000','66','52','2','0','7','5','Staten Island','0106000020266900000100000001030000000100000007000000083B4A6F79A8214127EC57B49926514151B51BB7CEA72141B2EAD6F38A2651416F429640B9A72141449FCB1C89265141163AA64D56A72141B89E2B7C9B26514150509213EDA72141DCC9A351A826514184FA4C6017A82141B9AE24F0AB265141083B4A6F79A8214127EC57B499265141');
Expand Down
33 changes: 23 additions & 10 deletions pkg/sql/row/row_converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ type DatumRowConverter struct {
IsTargetCol map[int]struct{}

// The rest of these are derived from tableDesc, just cached here.
hidden int
rowIDs []int
ri Inserter
EvalCtx *tree.EvalContext
cols []sqlbase.ColumnDescriptor
Expand Down Expand Up @@ -302,14 +302,18 @@ func NewDatumRowConverter(
_, ok := isTargetColID[col.ID]
return ok
}
c.hidden = -1
hidden := -1
c.rowIDs = make([]int, 0)
for i := range cols {
col := &cols[i]
if !isTargetCol(col) && col.HasDefault() && col.DefaultExprStr() == "unique_rowid()" {
c.rowIDs = append(c.rowIDs, i)
}
if col.Hidden {
if col.DefaultExpr == nil || *col.DefaultExpr != "unique_rowid()" || c.hidden != -1 {
if col.DefaultExpr == nil || *col.DefaultExpr != "unique_rowid()" || hidden != -1 {
return nil, errors.New("unexpected hidden column")
}
c.hidden = i
hidden = i
c.Datums = append(c.Datums, nil)
} else {
if !isTargetCol(col) && col.DefaultExpr != nil {
Expand Down Expand Up @@ -339,8 +343,8 @@ const rowIDBits = 64 - builtins.NodeIDBits
// Row inserts kv operations into the current kv batch, and triggers a SendBatch
// if necessary.
func (c *DatumRowConverter) Row(ctx context.Context, sourceID int32, rowIndex int64) error {
if c.hidden >= 0 {
// We don't want to call unique_rowid() for the hidden PK column because it
for i, pos := range c.rowIDs {
// We don't want to call unique_rowid() for columns with such default expressions because it
// is not idempotent and has unfortunate overlapping of output spans since
// it puts the uniqueness-ensuring per-generator part (nodeID) in the
// low-bits. Instead, make our own IDs that attempt to keep each generator
Expand All @@ -366,15 +370,24 @@ func (c *DatumRowConverter) Row(ctx context.Context, sourceID int32, rowIndex in
// fileIndex*desc.Version) could improve on this. For now, if this
// best-effort collision avoidance scheme doesn't work in some cases we can
// just recommend an explicit PK as a workaround.
//
// TODO(anzoteh96): As per the issue in #51004, having too many columns with
// default expression unique_rowid() could cause collisions when IMPORTs are run
// too close to each other. It will therefore be nice to fix this problem.
avoidCollisionsWithSQLsIDs := uint64(1 << 63)
rowID := (uint64(sourceID) << rowIDBits) ^ uint64(rowIndex)
c.Datums[c.hidden] = tree.NewDInt(tree.DInt(avoidCollisionsWithSQLsIDs | rowID))
shiftedRowIndex := int64(len(c.rowIDs))*rowIndex + int64(i)
rowID := (uint64(sourceID) << rowIDBits) ^ uint64(shiftedRowIndex)
c.Datums[pos] = tree.NewDInt(tree.DInt(avoidCollisionsWithSQLsIDs | rowID))
}

isTargetCol := func(i int) bool {
_, ok := c.IsTargetCol[i]
return ok
}
for i := range c.cols {
col := &c.cols[i]
if _, ok := c.IsTargetCol[i]; !ok && !col.Hidden && col.DefaultExpr != nil {
if !tree.IsConst(c.EvalCtx, c.defaultExprs[i]) {
if !isTargetCol(i) && !col.Hidden && col.DefaultExpr != nil {
if !(*col.DefaultExpr == "unique_rowid()" || tree.IsConst(c.EvalCtx, c.defaultExprs[i])) {
// Check if the default expression is a constant expression as we do not
// support non-constant default expressions for non-target columns in IMPORT INTO.
//
Expand Down

0 comments on commit 6fa2151

Please sign in to comment.