diff --git a/pkg/ccl/importccl/BUILD.bazel b/pkg/ccl/importccl/BUILD.bazel index 1bb0b6c739a5..e6a7ec044b2b 100644 --- a/pkg/ccl/importccl/BUILD.bazel +++ b/pkg/ccl/importccl/BUILD.bazel @@ -55,6 +55,7 @@ go_library( "//pkg/sql/faketreeeval", "//pkg/sql/gcjob", "//pkg/sql/lexbase", + "//pkg/sql/opt/memo", "//pkg/sql/parser", "//pkg/sql/pgwire/pgcode", "//pkg/sql/pgwire/pgerror", @@ -67,6 +68,7 @@ go_library( "//pkg/sql/sem/tree", "//pkg/sql/sessiondata", "//pkg/sql/sqltelemetry", + "//pkg/sql/stats", "//pkg/sql/types", "//pkg/util", "//pkg/util/bufalloc", diff --git a/pkg/ccl/importccl/import_stmt.go b/pkg/ccl/importccl/import_stmt.go index a156600e4089..e8687813abaf 100644 --- a/pkg/ccl/importccl/import_stmt.go +++ b/pkg/ccl/importccl/import_stmt.go @@ -49,12 +49,14 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/catalog/tabledesc" "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb" "github.com/cockroachdb/cockroach/pkg/sql/gcjob" + "github.com/cockroachdb/cockroach/pkg/sql/opt/memo" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice" "github.com/cockroachdb/cockroach/pkg/sql/privilege" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry" + "github.com/cockroachdb/cockroach/pkg/sql/stats" "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented" "github.com/cockroachdb/cockroach/pkg/util/hlc" @@ -2139,7 +2141,7 @@ func (r *importResumer) Resume(ctx context.Context, execCtx interface{}) error { return err } - if err := r.publishTables(ctx, p.ExecCfg()); err != nil { + if err := r.publishTables(ctx, p.ExecCfg(), res); err != nil { return err } @@ -2326,7 +2328,9 @@ func (r *importResumer) checkForUDTModification( } // publishTables updates the status of imported tables from OFFLINE to PUBLIC. 
-func (r *importResumer) publishTables(ctx context.Context, execCfg *sql.ExecutorConfig) error { +func (r *importResumer) publishTables( + ctx context.Context, execCfg *sql.ExecutorConfig, res roachpb.BulkOpSummary, +) error { details := r.job.Details().(jobspb.ImportDetails) // Tables should only be published once. if details.TablesPublished { @@ -2378,6 +2382,39 @@ func (r *importResumer) publishTables(ctx context.Context, execCfg *sql.Executor return errors.Wrap(err, "publishing tables") } + // Write "stub" statistics for new tables, which should be good enough to use + // until the full CREATE STATISTICS run finishes. + for _, tbl := range details.Tables { + if tbl.IsNew { + desc := tabledesc.NewUnsafeImmutable(tbl.Desc) + id := roachpb.BulkOpSummaryID(uint64(desc.GetID()), uint64(desc.GetPrimaryIndexID())) + rowCount := uint64(res.EntryCounts[id]) + // TODO(michae2): collect distinct and null counts during import. + distinctCount := uint64(float64(rowCount) * memo.UnknownDistinctCountRatio) + nullCount := uint64(float64(rowCount) * memo.UnknownNullCountRatio) + // Because we don't yet have real distinct and null counts, only produce + // single-column stats to avoid the appearance of perfectly correlated + // columns. + multiColEnabled := false + statistics, err := sql.StubTableStats(desc, jobspb.ImportStatsName, multiColEnabled) + if err == nil { + for _, statistic := range statistics { + statistic.RowCount = rowCount + statistic.DistinctCount = distinctCount + statistic.NullCount = nullCount + } + err = stats.InsertNewStats(ctx, execCfg.InternalExecutor, txn, statistics) + } + if err != nil { + // Failure to create statistics should not fail the entire import. + log.Warningf( + ctx, "error while creating statistics during import of %q: %v", + desc.GetName(), err, + ) + } + } + } + // Update job record to mark tables published state as complete. 
details.TablesPublished = true err := r.job.SetDetails(ctx, txn, details) diff --git a/pkg/ccl/importccl/import_stmt_test.go b/pkg/ccl/importccl/import_stmt_test.go index cba806322a6a..b10e2af6f997 100644 --- a/pkg/ccl/importccl/import_stmt_test.go +++ b/pkg/ccl/importccl/import_stmt_test.go @@ -1113,6 +1113,54 @@ CREATE TABLE t (a duration); typ: "CSV", err: `"s" not found`, }, + { + name: "statistics collection", + create: "a INT", + typ: "CSV", + data: "0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n" + + "10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n" + + "20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n" + + "30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n" + + "40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n" + + "50\n51\n52\n53\n54\n55\n56\n57\n58\n59\n" + + "60\n61\n62\n63\n64\n65\n66\n67\n68\n69\n" + + "70\n71\n72\n73\n74\n75\n76\n77\n78\n79\n" + + "80\n81\n82\n83\n84\n85\n86\n87\n88\n89\n" + + "90\n91\n92\n93\n94\n95\n96\n97\n98\n99\n", + query: map[string][][]string{ + "SELECT column_names, row_count, distinct_count, null_count " + + "FROM [SHOW STATISTICS FOR TABLE t] " + + "WHERE statistics_name = '__import__' " + + "ORDER BY column_names": { + {"{a}", "100", "10", "1"}, + {"{rowid}", "100", "10", "1"}, + }, + }, + }, + { + name: "statistics collection multi", + create: "a INT PRIMARY KEY, b INT, INDEX (b, a)", + typ: "CSV", + data: "0,0\n1,1\n2,2\n3,3\n4,4\n5,5\n6,6\n7,7\n8,8\n9,9\n" + + "10,10\n11,11\n12,12\n13,13\n14,14\n15,15\n16,16\n17,17\n18,18\n19,19\n" + + "20,20\n21,21\n22,22\n23,23\n24,24\n25,25\n26,26\n27,27\n28,28\n29,29\n" + + "30,30\n31,31\n32,32\n33,33\n34,34\n35,35\n36,36\n37,37\n38,38\n39,39\n" + + "40,40\n41,41\n42,42\n43,43\n44,44\n45,45\n46,46\n47,47\n48,48\n49,49\n" + + "50,50\n51,51\n52,52\n53,53\n54,54\n55,55\n56,56\n57,57\n58,58\n59,59\n" + + "60,60\n61,61\n62,62\n63,63\n64,64\n65,65\n66,66\n67,67\n68,68\n69,69\n" + + "70,70\n71,71\n72,72\n73,73\n74,74\n75,75\n76,76\n77,77\n78,78\n79,79\n" + + "80,80\n81,81\n82,82\n83,83\n84,84\n85,85\n86,86\n87,87\n88,88\n89,89\n" + + 
"90,90\n91,91\n92,92\n93,93\n94,94\n95,95\n96,96\n97,97\n98,98\n99,99", + query: map[string][][]string{ + "SELECT column_names, row_count, distinct_count, null_count " + + "FROM [SHOW STATISTICS FOR TABLE t] " + + "WHERE statistics_name = '__import__' " + + "ORDER BY column_names": { + {"{a}", "100", "10", "1"}, + {"{b}", "100", "10", "1"}, + }, + }, + }, } var mockRecorder struct { diff --git a/pkg/jobs/jobspb/wrap.go b/pkg/jobs/jobspb/wrap.go index 245ec02b60d0..6b33da89f6a3 100644 --- a/pkg/jobs/jobspb/wrap.go +++ b/pkg/jobs/jobspb/wrap.go @@ -71,6 +71,10 @@ var _ base.SQLInstanceID // running CREATE STATISTICS manually. const AutoStatsName = "__auto__" +// ImportStatsName is the name to use for statistics created automatically +// during import. +const ImportStatsName = "__import__" + // AutomaticJobTypes is a list of automatic job types that currently exist. var AutomaticJobTypes = [...]Type{ TypeAutoCreateStats, diff --git a/pkg/sql/create_stats.go b/pkg/sql/create_stats.go index fa948a81efc5..217496594a2e 100644 --- a/pkg/sql/create_stats.go +++ b/pkg/sql/create_stats.go @@ -62,6 +62,26 @@ var featureStatsEnabled = settings.RegisterBoolSetting( const defaultHistogramBuckets = 200 const nonIndexColHistogramBuckets = 2 +// StubTableStats returns one "stub" statistic per column set chosen by +// createStatsDefaultColumns: only TableID, Name and ColumnIDs are populated. +func StubTableStats( + desc catalog.TableDescriptor, name string, multiColEnabled bool, +) ([]*stats.TableStatisticProto, error) { + colStats, err := createStatsDefaultColumns(desc, multiColEnabled) + if err != nil { + return nil, err + } + statistics := make([]*stats.TableStatisticProto, len(colStats)) + for i, colStat := range colStats { + statistics[i] = &stats.TableStatisticProto{ + TableID: desc.GetID(), + Name: name, + ColumnIDs: colStat.ColumnIDs, + } + } + return statistics, nil +} + // createStatsNode is a planNode implemented in terms of a function. 
The // startJob function starts a Job during Start, and the remainder of the // CREATE STATISTICS planning and execution is performed within the jobs diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go index e2c464ca0b30..4bab922ebaec 100644 --- a/pkg/sql/opt/memo/statistics_builder.go +++ b/pkg/sql/opt/memo/statistics_builder.go @@ -445,7 +445,7 @@ func (sb *statisticsBuilder) colStatLeaf( if nullableCols.Equals(colSet) { // No column statistics on this colSet - use the unknown // null count ratio. - colStat.NullCount = s.RowCount * unknownNullCountRatio + colStat.NullCount = s.RowCount * UnknownNullCountRatio } else { colStatLeaf := sb.colStatLeaf(nullableCols, s, fd, notNullCols) // Fetch the colStat again since it may now have a different address. @@ -460,8 +460,8 @@ func (sb *statisticsBuilder) colStatLeaf( if colSet.Len() == 1 { col, _ := colSet.Next(0) - colStat.DistinctCount = unknownDistinctCountRatio * s.RowCount - colStat.NullCount = unknownNullCountRatio * s.RowCount + colStat.DistinctCount = UnknownDistinctCountRatio * s.RowCount + colStat.NullCount = UnknownNullCountRatio * s.RowCount if notNullCols.Contains(col) { colStat.NullCount = 0 } @@ -2121,7 +2121,7 @@ func (sb *statisticsBuilder) colStatMax1Row( s := &max1Row.Relational().Stats colStat, _ := s.ColStats.Add(colSet) colStat.DistinctCount = 1 - colStat.NullCount = s.RowCount * unknownNullCountRatio + colStat.NullCount = s.RowCount * UnknownNullCountRatio if colSet.Intersects(max1Row.Relational().NotNullCols) { colStat.NullCount = 0 } @@ -2319,7 +2319,7 @@ func (sb *statisticsBuilder) colStatProjectSet( // requested columns correspond to, and estimate the distinct count and // null count based on the type of generator function and its parameters. 
zipColsDistinctCount *= unknownGeneratorRowCount * unknownGeneratorDistinctCountRatio - zipColsNullCount *= unknownNullCountRatio + zipColsNullCount *= UnknownNullCountRatio } else { // The columns(s) contain a scalar function or expression. // These columns can contain many null values if the zip also @@ -2346,7 +2346,7 @@ func (sb *statisticsBuilder) colStatProjectSet( if item.ScalarProps().OuterCols.Intersects(inputProps.OutputCols) { // The column(s) are correlated with the input, so they may have a // distinct value for each distinct row of the input. - zipColsDistinctCount *= inputStats.RowCount * unknownDistinctCountRatio + zipColsDistinctCount *= inputStats.RowCount * UnknownDistinctCountRatio } } } @@ -2791,11 +2791,11 @@ const ( // This is the ratio of distinct column values to number of rows, which is // used in the absence of any real statistics for non-key columns. // TODO(rytaft): See if there is an industry standard value for this. - unknownDistinctCountRatio = 0.1 + UnknownDistinctCountRatio = 0.1 // This is the ratio of null column values to number of rows for nullable // columns, which is used in the absence of any real statistics. - unknownNullCountRatio = 0.01 + UnknownNullCountRatio = 0.01 // Use a small row count for generator functions; this allows use of lookup // join in cases like using json_array_elements with a small constant array. @@ -3198,7 +3198,7 @@ func (sb *statisticsBuilder) applyIndexConstraint( numConjuncts := sb.numConjunctsInConstraint(c, i) // Set the distinct count for the current column of the constraint - // according to unknownDistinctCountRatio. + // according to UnknownDistinctCountRatio. var lowerBound float64 if i == applied { lowerBound = lastColMinDistinct @@ -3250,7 +3250,7 @@ func (sb *statisticsBuilder) applyConstraintSet( numConjuncts := sb.numConjunctsInConstraint(c, 0 /* nth */) // Set the distinct count for the first column of the constraint - // according to unknownDistinctCountRatio. 
+ // according to UnknownDistinctCountRatio. sb.updateDistinctCountFromUnappliedConjuncts(col, e, s, numConjuncts, lastColMinDistinct) }