importccl: write stub table statistics during import
At the end of an import job we notify the StatsRefresher that it should
collect statistics for the tables we imported into, but it can take a
while for those statistics to actually be collected. Meanwhile, if the
table is new, the optimizer will be operating totally blind. To make
this period slightly less painful, write stub statistics consisting
only of row counts at the end of the import.

Release note (performance improvement): collect basic table statistics
during import, to help the optimizer until full statistics collection
completes.
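
As a rough, standalone sketch of the mechanism (not the CockroachDB code itself: stubStat and makeStubStats are hypothetical stand-ins for stats.TableStatisticProto and the logic added to publishTables below, and the two ratios mirror the optimizer's UnknownDistinctCountRatio and UnknownNullCountRatio exported by this commit):

package main

import "fmt"

// Fallback ratios the optimizer already uses when it has no statistics;
// the stub stats bake them in, so the imported row count is the only
// real input.
const (
	unknownDistinctCountRatio = 0.1
	unknownNullCountRatio     = 0.01
)

// stubStat is a simplified stand-in for stats.TableStatisticProto.
type stubStat struct {
	name          string
	columns       []string
	rowCount      uint64
	distinctCount uint64
	nullCount     uint64
}

// makeStubStats derives one stub statistic per column set from the row
// count alone, analogous to what publishTables does with the
// BulkOpSummary entry counts.
func makeStubStats(rowCount uint64, columnSets [][]string) []stubStat {
	out := make([]stubStat, len(columnSets))
	for i, cols := range columnSets {
		out[i] = stubStat{
			name:          "__import__",
			columns:       cols,
			rowCount:      rowCount,
			distinctCount: uint64(float64(rowCount) * unknownDistinctCountRatio),
			nullCount:     uint64(float64(rowCount) * unknownNullCountRatio),
		}
	}
	return out
}

func main() {
	// 100 imported rows yield distinct_count 10 and null_count 1 for every
	// column set, matching the new import_stmt_test.go expectations.
	for _, s := range makeStubStats(100, [][]string{{"a"}, {"b"}, {"b", "a"}}) {
		fmt.Printf("%+v\n", s)
	}
}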
michae2 committed Sep 15, 2021
1 parent 2baefd3 commit 4741230
Showing 6 changed files with 122 additions and 15 deletions.
2 changes: 2 additions & 0 deletions pkg/ccl/importccl/BUILD.bazel
@@ -55,6 +55,7 @@ go_library(
"//pkg/sql/faketreeeval",
"//pkg/sql/gcjob",
"//pkg/sql/lexbase",
"//pkg/sql/opt/memo",
"//pkg/sql/parser",
"//pkg/sql/pgwire/pgcode",
"//pkg/sql/pgwire/pgerror",
@@ -67,6 +68,7 @@ go_library(
"//pkg/sql/sem/tree",
"//pkg/sql/sessiondata",
"//pkg/sql/sqltelemetry",
"//pkg/sql/stats",
"//pkg/sql/types",
"//pkg/util",
"//pkg/util/bufalloc",
36 changes: 34 additions & 2 deletions pkg/ccl/importccl/import_stmt.go
@@ -49,12 +49,14 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/catalog/tabledesc"
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
"github.com/cockroachdb/cockroach/pkg/sql/gcjob"
"github.com/cockroachdb/cockroach/pkg/sql/opt/memo"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice"
"github.com/cockroachdb/cockroach/pkg/sql/privilege"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry"
"github.com/cockroachdb/cockroach/pkg/sql/stats"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -2139,7 +2141,7 @@ func (r *importResumer) Resume(ctx context.Context, execCtx interface{}) error {
return err
}

if err := r.publishTables(ctx, p.ExecCfg()); err != nil {
if err := r.publishTables(ctx, p.ExecCfg(), res); err != nil {
return err
}

@@ -2326,7 +2328,9 @@ func (r *importResumer) checkForUDTModification(
}

// publishTables updates the status of imported tables from OFFLINE to PUBLIC.
func (r *importResumer) publishTables(ctx context.Context, execCfg *sql.ExecutorConfig) error {
func (r *importResumer) publishTables(
ctx context.Context, execCfg *sql.ExecutorConfig, res roachpb.BulkOpSummary,
) error {
details := r.job.Details().(jobspb.ImportDetails)
// Tables should only be published once.
if details.TablesPublished {
@@ -2378,6 +2382,34 @@ func (r *importResumer) publishTables(ctx context.Context, execCfg *sql.ExecutorConfig) error {
return errors.Wrap(err, "publishing tables")
}

// Write "stub" statistics for new tables, which should be good enough to use
// until the full CREATE STATISTICS run finishes.
for _, tbl := range details.Tables {
if tbl.IsNew {
desc := tabledesc.NewUnsafeImmutable(tbl.Desc)
id := roachpb.BulkOpSummaryID(uint64(desc.GetID()), uint64(desc.GetPrimaryIndexID()))
rowCount := uint64(res.EntryCounts[id])
distinctCount := uint64(float64(rowCount) * memo.UnknownDistinctCountRatio)
nullCount := uint64(float64(rowCount) * memo.UnknownNullCountRatio)
statistics, err := sql.StubTableStats(execCfg.Settings, desc, jobspb.ImportStatsName)
if err == nil {
for _, statistic := range statistics {
statistic.RowCount = rowCount
statistic.DistinctCount = distinctCount
statistic.NullCount = nullCount
}
err = stats.InsertNewStats(ctx, execCfg.InternalExecutor, txn, statistics)
}
if err != nil {
// Failure to create statistics should not fail the entire import.
log.Warningf(
ctx, "error while creating stub statistics during import of %q: %v",
desc.GetName(), err,
)
}
}
}

// Update job record to mark tables published state as complete.
details.TablesPublished = true
err := r.job.SetDetails(ctx, txn, details)
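One Go subtlety in the publishTables loop above: StubTableStats returns a slice of pointers, so assigning through the range variable mutates the shared protos even though range copies each element. A minimal, self-contained illustration (tableStatistic is a hypothetical stand-in):

package main

import "fmt"

type tableStatistic struct{ RowCount uint64 }

func main() {
	// A slice of pointers, like the []*stats.TableStatisticProto returned
	// by StubTableStats: range copies each pointer, but both copies refer
	// to the same struct, so the assignment below is visible afterwards.
	byPtr := []*tableStatistic{{}, {}}
	for _, s := range byPtr {
		s.RowCount = 100
	}
	fmt.Println(byPtr[0].RowCount, byPtr[1].RowCount) // 100 100

	// With a slice of values, the same loop would only mutate the copies.
	byVal := []tableStatistic{{}, {}}
	for _, s := range byVal {
		s.RowCount = 100
	}
	fmt.Println(byVal[0].RowCount, byVal[1].RowCount) // 0 0
}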
49 changes: 49 additions & 0 deletions pkg/ccl/importccl/import_stmt_test.go
@@ -1113,6 +1113,55 @@ CREATE TABLE t (a duration);
typ: "CSV",
err: `"s" not found`,
},
{
name: "statistics collection",
create: "a INT",
typ: "CSV",
data: "0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n" +
"10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n" +
"20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n" +
"30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n" +
"40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n" +
"50\n51\n52\n53\n54\n55\n56\n57\n58\n59\n" +
"60\n61\n62\n63\n64\n65\n66\n67\n68\n69\n" +
"70\n71\n72\n73\n74\n75\n76\n77\n78\n79\n" +
"80\n81\n82\n83\n84\n85\n86\n87\n88\n89\n" +
"90\n91\n92\n93\n94\n95\n96\n97\n98\n99\n",
query: map[string][][]string{
"SELECT column_names, row_count, distinct_count, null_count " +
"FROM [SHOW STATISTICS FOR TABLE t] " +
"WHERE statistics_name = '__import__' " +
"ORDER BY column_names": {
{"{a}", "100", "10", "1"},
{"{rowid}", "100", "10", "1"},
},
},
},
{
name: "statistics collection multi",
create: "a INT PRIMARY KEY, b INT, INDEX (b, a)",
typ: "CSV",
data: "0,0\n1,1\n2,2\n3,3\n4,4\n5,5\n6,6\n7,7\n8,8\n9,9\n" +
"10,10\n11,11\n12,12\n13,13\n14,14\n15,15\n16,16\n17,17\n18,18\n19,19\n" +
"20,20\n21,21\n22,22\n23,23\n24,24\n25,25\n26,26\n27,27\n28,28\n29,29\n" +
"30,30\n31,31\n32,32\n33,33\n34,34\n35,35\n36,36\n37,37\n38,38\n39,39\n" +
"40,40\n41,41\n42,42\n43,43\n44,44\n45,45\n46,46\n47,47\n48,48\n49,49\n" +
"50,50\n51,51\n52,52\n53,53\n54,54\n55,55\n56,56\n57,57\n58,58\n59,59\n" +
"60,60\n61,61\n62,62\n63,63\n64,64\n65,65\n66,66\n67,67\n68,68\n69,69\n" +
"70,70\n71,71\n72,72\n73,73\n74,74\n75,75\n76,76\n77,77\n78,78\n79,79\n" +
"80,80\n81,81\n82,82\n83,83\n84,84\n85,85\n86,86\n87,87\n88,88\n89,89\n" +
"90,90\n91,91\n92,92\n93,93\n94,94\n95,95\n96,96\n97,97\n98,98\n99,99",
query: map[string][][]string{
"SELECT column_names, row_count, distinct_count, null_count " +
"FROM [SHOW STATISTICS FOR TABLE t] " +
"WHERE statistics_name = '__import__' " +
"ORDER BY column_names": {
{"{a}", "100", "10", "1"},
{"{b}", "100", "10", "1"},
{"{b,a}", "100", "10", "1"},
},
},
},
}

var mockRecorder struct {
4 changes: 4 additions & 0 deletions pkg/jobs/jobspb/wrap.go
@@ -71,6 +71,10 @@ var _ base.SQLInstanceID
// running CREATE STATISTICS manually.
const AutoStatsName = "__auto__"

// ImportStatsName is the name to use for statistics created automatically
// during import.
const ImportStatsName = "__import__"

// AutomaticJobTypes is a list of automatic job types that currently exist.
var AutomaticJobTypes = [...]Type{
TypeAutoCreateStats,
26 changes: 23 additions & 3 deletions pkg/sql/create_stats.go
@@ -62,6 +62,26 @@ var featureStatsEnabled = settings.RegisterBoolSetting(
const defaultHistogramBuckets = 200
const nonIndexColHistogramBuckets = 2

// StubTableStats generates "stub" statistics for a table. The stubs lack
// histograms and have 0 for all values.
func StubTableStats(
st *cluster.Settings, desc catalog.TableDescriptor, name string,
) ([]*stats.TableStatisticProto, error) {
colStats, err := createStatsDefaultColumns(st, desc)
if err != nil {
return nil, err
}
statistics := make([]*stats.TableStatisticProto, len(colStats))
for i, colStat := range colStats {
statistics[i] = &stats.TableStatisticProto{
TableID: desc.GetID(),
Name: name,
ColumnIDs: colStat.ColumnIDs,
}
}
return statistics, nil
}

// createStatsNode is a planNode implemented in terms of a function. The
// startJob function starts a Job during Start, and the remainder of the
// CREATE STATISTICS planning and execution is performed within the jobs
@@ -215,8 +235,7 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, error) {
// Identify which columns we should create statistics for.
var colStats []jobspb.CreateStatsDetails_ColStat
if len(n.ColumnNames) == 0 {
multiColEnabled := stats.MultiColumnStatisticsClusterMode.Get(&n.p.ExecCfg().Settings.SV)
if colStats, err = createStatsDefaultColumns(tableDesc, multiColEnabled); err != nil {
if colStats, err = createStatsDefaultColumns(n.p.ExecCfg().Settings, tableDesc); err != nil {
return nil, err
}
} else {
@@ -314,8 +333,9 @@ const maxNonIndexCols = 100
// other columns from the table. We only collect histograms for index columns,
// plus any other boolean or enum columns (where the "histogram" is tiny).
func createStatsDefaultColumns(
desc catalog.TableDescriptor, multiColEnabled bool,
st *cluster.Settings, desc catalog.TableDescriptor,
) ([]jobspb.CreateStatsDetails_ColStat, error) {
multiColEnabled := stats.MultiColumnStatisticsClusterMode.Get(&st.SV)
colStats := make([]jobspb.CreateStatsDetails_ColStat, 0, len(desc.ActiveIndexes()))

requestedStats := make(map[string]struct{})
20 changes: 10 additions & 10 deletions pkg/sql/opt/memo/statistics_builder.go
@@ -445,7 +445,7 @@ func (sb *statisticsBuilder) colStatLeaf(
if nullableCols.Equals(colSet) {
// No column statistics on this colSet - use the unknown
// null count ratio.
colStat.NullCount = s.RowCount * unknownNullCountRatio
colStat.NullCount = s.RowCount * UnknownNullCountRatio
} else {
colStatLeaf := sb.colStatLeaf(nullableCols, s, fd, notNullCols)
// Fetch the colStat again since it may now have a different address.
@@ -460,8 +460,8 @@ func (sb *statisticsBuilder) colStatLeaf(

if colSet.Len() == 1 {
col, _ := colSet.Next(0)
colStat.DistinctCount = unknownDistinctCountRatio * s.RowCount
colStat.NullCount = unknownNullCountRatio * s.RowCount
colStat.DistinctCount = UnknownDistinctCountRatio * s.RowCount
colStat.NullCount = UnknownNullCountRatio * s.RowCount
if notNullCols.Contains(col) {
colStat.NullCount = 0
}
@@ -2121,7 +2121,7 @@ func (sb *statisticsBuilder) colStatMax1Row(
s := &max1Row.Relational().Stats
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = 1
colStat.NullCount = s.RowCount * unknownNullCountRatio
colStat.NullCount = s.RowCount * UnknownNullCountRatio
if colSet.Intersects(max1Row.Relational().NotNullCols) {
colStat.NullCount = 0
}
@@ -2319,7 +2319,7 @@ func (sb *statisticsBuilder) colStatProjectSet(
// requested columns correspond to, and estimate the distinct count and
// null count based on the type of generator function and its parameters.
zipColsDistinctCount *= unknownGeneratorRowCount * unknownGeneratorDistinctCountRatio
zipColsNullCount *= unknownNullCountRatio
zipColsNullCount *= UnknownNullCountRatio
} else {
// The columns(s) contain a scalar function or expression.
// These columns can contain many null values if the zip also
@@ -2346,7 +2346,7 @@ func (sb *statisticsBuilder) colStatProjectSet(
if item.ScalarProps().OuterCols.Intersects(inputProps.OutputCols) {
// The column(s) are correlated with the input, so they may have a
// distinct value for each distinct row of the input.
zipColsDistinctCount *= inputStats.RowCount * unknownDistinctCountRatio
zipColsDistinctCount *= inputStats.RowCount * UnknownDistinctCountRatio
}
}
}
@@ -2791,11 +2791,11 @@ const (
// This is the ratio of distinct column values to number of rows, which is
// used in the absence of any real statistics for non-key columns.
// TODO(rytaft): See if there is an industry standard value for this.
unknownDistinctCountRatio = 0.1
UnknownDistinctCountRatio = 0.1

// This is the ratio of null column values to number of rows for nullable
// columns, which is used in the absence of any real statistics.
unknownNullCountRatio = 0.01
UnknownNullCountRatio = 0.01

// Use a small row count for generator functions; this allows use of lookup
// join in cases like using json_array_elements with a small constant array.
@@ -3198,7 +3198,7 @@ func (sb *statisticsBuilder) applyIndexConstraint(
numConjuncts := sb.numConjunctsInConstraint(c, i)

// Set the distinct count for the current column of the constraint
// according to unknownDistinctCountRatio.
// according to UnknownDistinctCountRatio.
var lowerBound float64
if i == applied {
lowerBound = lastColMinDistinct
@@ -3250,7 +3250,7 @@ func (sb *statisticsBuilder) applyConstraintSet(
numConjuncts := sb.numConjunctsInConstraint(c, 0 /* nth */)

// Set the distinct count for the first column of the constraint
// according to unknownDistinctCountRatio.
// according to UnknownDistinctCountRatio.
sb.updateDistinctCountFromUnappliedConjuncts(col, e, s, numConjuncts, lastColMinDistinct)
}
