importccl: write stub table statistics during import
At the end of an import job we notify the StatsRefresher that it should
collect statistics for the tables we imported into, but it can take a
while for those statistics to actually be collected. Meanwhile, if the
table is new, the optimizer will be operating totally blind. To make
this period slightly less painful, write stub statistics consisting
only of row counts at the end of the import.

Release note (performance improvement): collect basic table statistics
during import, to help the optimizer until full statistics collection
completes.
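
As a rough, standalone sketch of the mechanism (not the CockroachDB code itself: stubStat and makeStubStats are hypothetical stand-ins for stats.TableStatisticProto and the logic added to publishTables below, and the two ratios mirror the optimizer's UnknownDistinctCountRatio and UnknownNullCountRatio exported by this commit):

package main

import "fmt"

// Fallback ratios the optimizer already uses when it has no statistics;
// the stub stats bake them in, so the imported row count is the only
// real input.
const (
	unknownDistinctCountRatio = 0.1
	unknownNullCountRatio     = 0.01
)

// stubStat is a simplified stand-in for stats.TableStatisticProto.
type stubStat struct {
	name          string
	columns       []string
	rowCount      uint64
	distinctCount uint64
	nullCount     uint64
}

// makeStubStats derives one stub statistic per column set from the row
// count alone, analogous to what publishTables does with the
// BulkOpSummary entry counts.
func makeStubStats(rowCount uint64, columnSets [][]string) []stubStat {
	out := make([]stubStat, len(columnSets))
	for i, cols := range columnSets {
		out[i] = stubStat{
			name:          "__import__",
			columns:       cols,
			rowCount:      rowCount,
			distinctCount: uint64(float64(rowCount) * unknownDistinctCountRatio),
			nullCount:     uint64(float64(rowCount) * unknownNullCountRatio),
		}
	}
	return out
}

func main() {
	// 100 imported rows yield distinct_count 10 and null_count 1 for every
	// column set, matching the new import_stmt_test.go expectations.
	for _, s := range makeStubStats(100, [][]string{{"a"}, {"b"}, {"b", "a"}}) {
		fmt.Printf("%+v\n", s)
	}
}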
michae2 committed Sep 15, 2021
1 parent 2baefd3 commit 4741230
Showing 6 changed files with 122 additions and 15 deletions.
2 changes: 2 additions & 0 deletions pkg/ccl/importccl/BUILD.bazel
@@ -55,6 +55,7 @@ go_library(
"//pkg/sql/faketreeeval",
"//pkg/sql/gcjob",
"//pkg/sql/lexbase",
"//pkg/sql/opt/memo",
"//pkg/sql/parser",
"//pkg/sql/pgwire/pgcode",
"//pkg/sql/pgwire/pgerror",
@@ -67,6 +68,7 @@ go_library(
"//pkg/sql/sem/tree",
"//pkg/sql/sessiondata",
"//pkg/sql/sqltelemetry",
"//pkg/sql/stats",
"//pkg/sql/types",
"//pkg/util",
"//pkg/util/bufalloc",
36 changes: 34 additions & 2 deletions pkg/ccl/importccl/import_stmt.go
@@ -49,12 +49,14 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/catalog/tabledesc"
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
"github.com/cockroachdb/cockroach/pkg/sql/gcjob"
"github.com/cockroachdb/cockroach/pkg/sql/opt/memo"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice"
"github.com/cockroachdb/cockroach/pkg/sql/privilege"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry"
"github.com/cockroachdb/cockroach/pkg/sql/stats"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -2139,7 +2141,7 @@ func (r *importResumer) Resume(ctx context.Context, execCtx interface{}) error {
return err
}

if err := r.publishTables(ctx, p.ExecCfg()); err != nil {
if err := r.publishTables(ctx, p.ExecCfg(), res); err != nil {
return err
}

@@ -2326,7 +2328,9 @@ func (r *importResumer) checkForUDTModification(
}

// publishTables updates the status of imported tables from OFFLINE to PUBLIC.
func (r *importResumer) publishTables(ctx context.Context, execCfg *sql.ExecutorConfig) error {
func (r *importResumer) publishTables(
ctx context.Context, execCfg *sql.ExecutorConfig, res roachpb.BulkOpSummary,
) error {
details := r.job.Details().(jobspb.ImportDetails)
// Tables should only be published once.
if details.TablesPublished {
@@ -2378,6 +2382,34 @@ func (r *importResumer) publishTables(ctx context.Context, execCfg *sql.ExecutorConfig) error {
return errors.Wrap(err, "publishing tables")
}

// Write "stub" statistics for new tables, which should be good enough to use
// until the full CREATE STATISTICS run finishes.
for _, tbl := range details.Tables {
if tbl.IsNew {
desc := tabledesc.NewUnsafeImmutable(tbl.Desc)
id := roachpb.BulkOpSummaryID(uint64(desc.GetID()), uint64(desc.GetPrimaryIndexID()))
rowCount := uint64(res.EntryCounts[id])
distinctCount := uint64(float64(rowCount) * memo.UnknownDistinctCountRatio)
nullCount := uint64(float64(rowCount) * memo.UnknownNullCountRatio)
statistics, err := sql.StubTableStats(execCfg.Settings, desc, jobspb.ImportStatsName)
if err == nil {
for _, statistic := range statistics {
statistic.RowCount = rowCount
statistic.DistinctCount = distinctCount
statistic.NullCount = nullCount
}
err = stats.InsertNewStats(ctx, execCfg.InternalExecutor, txn, statistics)
}
if err != nil {
// Failure to create statistics should not fail the entire import.
log.Warningf(
ctx, "error while creating stub statistics during import of %q: %v",
desc.GetName(), err,
)
}
}
}

// Update job record to mark tables published state as complete.
details.TablesPublished = true
err := r.job.SetDetails(ctx, txn, details)
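One Go subtlety in the publishTables loop above: StubTableStats returns a slice of pointers, so assigning through the range variable mutates the shared protos even though range copies each element. A minimal, self-contained illustration (tableStatistic is a hypothetical stand-in):

package main

import "fmt"

type tableStatistic struct{ RowCount uint64 }

func main() {
	// A slice of pointers, like the []*stats.TableStatisticProto returned
	// by StubTableStats: range copies each pointer, but both copies refer
	// to the same struct, so the assignment below is visible afterwards.
	byPtr := []*tableStatistic{{}, {}}
	for _, s := range byPtr {
		s.RowCount = 100
	}
	fmt.Println(byPtr[0].RowCount, byPtr[1].RowCount) // 100 100

	// With a slice of values, the same loop would only mutate the copies.
	byVal := []tableStatistic{{}, {}}
	for _, s := range byVal {
		s.RowCount = 100
	}
	fmt.Println(byVal[0].RowCount, byVal[1].RowCount) // 0 0
}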
49 changes: 49 additions & 0 deletions pkg/ccl/importccl/import_stmt_test.go
@@ -1113,6 +1113,55 @@ CREATE TABLE t (a duration);
typ: "CSV",
err: `"s" not found`,
},
{
name: "statistics collection",
create: "a INT",
typ: "CSV",
data: "0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n" +
"10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n" +
"20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n" +
"30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n" +
"40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n" +
"50\n51\n52\n53\n54\n55\n56\n57\n58\n59\n" +
"60\n61\n62\n63\n64\n65\n66\n67\n68\n69\n" +
"70\n71\n72\n73\n74\n75\n76\n77\n78\n79\n" +
"80\n81\n82\n83\n84\n85\n86\n87\n88\n89\n" +
"90\n91\n92\n93\n94\n95\n96\n97\n98\n99\n",
query: map[string][][]string{
"SELECT column_names, row_count, distinct_count, null_count " +
"FROM [SHOW STATISTICS FOR TABLE t] " +
"WHERE statistics_name = '__import__' " +
"ORDER BY column_names": {
{"{a}", "100", "10", "1"},
{"{rowid}", "100", "10", "1"},
},
},
},
{
name: "statistics collection multi",
create: "a INT PRIMARY KEY, b INT, INDEX (b, a)",
typ: "CSV",
data: "0,0\n1,1\n2,2\n3,3\n4,4\n5,5\n6,6\n7,7\n8,8\n9,9\n" +
"10,10\n11,11\n12,12\n13,13\n14,14\n15,15\n16,16\n17,17\n18,18\n19,19\n" +
"20,20\n21,21\n22,22\n23,23\n24,24\n25,25\n26,26\n27,27\n28,28\n29,29\n" +
"30,30\n31,31\n32,32\n33,33\n34,34\n35,35\n36,36\n37,37\n38,38\n39,39\n" +
"40,40\n41,41\n42,42\n43,43\n44,44\n45,45\n46,46\n47,47\n48,48\n49,49\n" +
"50,50\n51,51\n52,52\n53,53\n54,54\n55,55\n56,56\n57,57\n58,58\n59,59\n" +
"60,60\n61,61\n62,62\n63,63\n64,64\n65,65\n66,66\n67,67\n68,68\n69,69\n" +
"70,70\n71,71\n72,72\n73,73\n74,74\n75,75\n76,76\n77,77\n78,78\n79,79\n" +
"80,80\n81,81\n82,82\n83,83\n84,84\n85,85\n86,86\n87,87\n88,88\n89,89\n" +
"90,90\n91,91\n92,92\n93,93\n94,94\n95,95\n96,96\n97,97\n98,98\n99,99",
query: map[string][][]string{
"SELECT column_names, row_count, distinct_count, null_count " +
"FROM [SHOW STATISTICS FOR TABLE t] " +
"WHERE statistics_name = '__import__' " +
"ORDER BY column_names": {
{"{a}", "100", "10", "1"},
{"{b}", "100", "10", "1"},
{"{b,a}", "100", "10", "1"},
},
},
},
}

var mockRecorder struct {
4 changes: 4 additions & 0 deletions pkg/jobs/jobspb/wrap.go
@@ -71,6 +71,10 @@ var _ base.SQLInstanceID
// running CREATE STATISTICS manually.
const AutoStatsName = "__auto__"

// ImportStatsName is the name to use for statistics created automatically
// during import.
const ImportStatsName = "__import__"

// AutomaticJobTypes is a list of automatic job types that currently exist.
var AutomaticJobTypes = [...]Type{
TypeAutoCreateStats,
26 changes: 23 additions & 3 deletions pkg/sql/create_stats.go
@@ -62,6 +62,26 @@ var featureStatsEnabled = settings.RegisterBoolSetting(
const defaultHistogramBuckets = 200
const nonIndexColHistogramBuckets = 2

// StubTableStats generates "stub" statistics for a table. The stubs lack
// histograms and have 0 for all values.
func StubTableStats(
st *cluster.Settings, desc catalog.TableDescriptor, name string,
) ([]*stats.TableStatisticProto, error) {
colStats, err := createStatsDefaultColumns(st, desc)
if err != nil {
return nil, err
}
statistics := make([]*stats.TableStatisticProto, len(colStats))
for i, colStat := range colStats {
statistics[i] = &stats.TableStatisticProto{
TableID: desc.GetID(),
Name: name,
ColumnIDs: colStat.ColumnIDs,
}
}
return statistics, nil
}

// createStatsNode is a planNode implemented in terms of a function. The
// startJob function starts a Job during Start, and the remainder of the
// CREATE STATISTICS planning and execution is performed within the jobs
@@ -215,8 +235,7 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, error) {
// Identify which columns we should create statistics for.
var colStats []jobspb.CreateStatsDetails_ColStat
if len(n.ColumnNames) == 0 {
multiColEnabled := stats.MultiColumnStatisticsClusterMode.Get(&n.p.ExecCfg().Settings.SV)
if colStats, err = createStatsDefaultColumns(tableDesc, multiColEnabled); err != nil {
if colStats, err = createStatsDefaultColumns(n.p.ExecCfg().Settings, tableDesc); err != nil {
return nil, err
}
} else {
@@ -314,8 +333,9 @@ const maxNonIndexCols = 100
// other columns from the table. We only collect histograms for index columns,
// plus any other boolean or enum columns (where the "histogram" is tiny).
func createStatsDefaultColumns(
desc catalog.TableDescriptor, multiColEnabled bool,
st *cluster.Settings, desc catalog.TableDescriptor,
) ([]jobspb.CreateStatsDetails_ColStat, error) {
multiColEnabled := stats.MultiColumnStatisticsClusterMode.Get(&st.SV)
colStats := make([]jobspb.CreateStatsDetails_ColStat, 0, len(desc.ActiveIndexes()))

requestedStats := make(map[string]struct{})
20 changes: 10 additions & 10 deletions pkg/sql/opt/memo/statistics_builder.go
@@ -445,7 +445,7 @@ func (sb *statisticsBuilder) colStatLeaf(
if nullableCols.Equals(colSet) {
// No column statistics on this colSet - use the unknown
// null count ratio.
colStat.NullCount = s.RowCount * unknownNullCountRatio
colStat.NullCount = s.RowCount * UnknownNullCountRatio
} else {
colStatLeaf := sb.colStatLeaf(nullableCols, s, fd, notNullCols)
// Fetch the colStat again since it may now have a different address.
@@ -460,8 +460,8 @@ func (sb *statisticsBuilder) colStatLeaf(

if colSet.Len() == 1 {
col, _ := colSet.Next(0)
colStat.DistinctCount = unknownDistinctCountRatio * s.RowCount
colStat.NullCount = unknownNullCountRatio * s.RowCount
colStat.DistinctCount = UnknownDistinctCountRatio * s.RowCount
colStat.NullCount = UnknownNullCountRatio * s.RowCount
if notNullCols.Contains(col) {
colStat.NullCount = 0
}
@@ -2121,7 +2121,7 @@ func (sb *statisticsBuilder) colStatMax1Row(
s := &max1Row.Relational().Stats
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = 1
colStat.NullCount = s.RowCount * unknownNullCountRatio
colStat.NullCount = s.RowCount * UnknownNullCountRatio
if colSet.Intersects(max1Row.Relational().NotNullCols) {
colStat.NullCount = 0
}
@@ -2319,7 +2319,7 @@ func (sb *statisticsBuilder) colStatProjectSet(
// requested columns correspond to, and estimate the distinct count and
// null count based on the type of generator function and its parameters.
zipColsDistinctCount *= unknownGeneratorRowCount * unknownGeneratorDistinctCountRatio
zipColsNullCount *= unknownNullCountRatio
zipColsNullCount *= UnknownNullCountRatio
} else {
// The columns(s) contain a scalar function or expression.
// These columns can contain many null values if the zip also
@@ -2346,7 +2346,7 @@ func (sb *statisticsBuilder) colStatProjectSet(
if item.ScalarProps().OuterCols.Intersects(inputProps.OutputCols) {
// The column(s) are correlated with the input, so they may have a
// distinct value for each distinct row of the input.
zipColsDistinctCount *= inputStats.RowCount * unknownDistinctCountRatio
zipColsDistinctCount *= inputStats.RowCount * UnknownDistinctCountRatio
}
}
}
@@ -2791,11 +2791,11 @@ const (
// This is the ratio of distinct column values to number of rows, which is
// used in the absence of any real statistics for non-key columns.
// TODO(rytaft): See if there is an industry standard value for this.
unknownDistinctCountRatio = 0.1
UnknownDistinctCountRatio = 0.1

// This is the ratio of null column values to number of rows for nullable
// columns, which is used in the absence of any real statistics.
unknownNullCountRatio = 0.01
UnknownNullCountRatio = 0.01

// Use a small row count for generator functions; this allows use of lookup
// join in cases like using json_array_elements with a small constant array.
@@ -3198,7 +3198,7 @@ func (sb *statisticsBuilder) applyIndexConstraint(
numConjuncts := sb.numConjunctsInConstraint(c, i)

// Set the distinct count for the current column of the constraint
// according to unknownDistinctCountRatio.
// according to UnknownDistinctCountRatio.
var lowerBound float64
if i == applied {
lowerBound = lastColMinDistinct
@@ -3250,7 +3250,7 @@ func (sb *statisticsBuilder) applyConstraintSet(
numConjuncts := sb.numConjunctsInConstraint(c, 0 /* nth */)

// Set the distinct count for the first column of the constraint
// according to unknownDistinctCountRatio.
// according to UnknownDistinctCountRatio.
sb.updateDistinctCountFromUnappliedConjuncts(col, e, s, numConjuncts, lastColMinDistinct)
}
