-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
statistics_builder.go
3331 lines (2931 loc) · 116 KB
/
statistics_builder.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package memo
import (
"math"
"reflect"
"github.com/cockroachdb/cockroach/pkg/sql/opt"
"github.com/cockroachdb/cockroach/pkg/sql/opt/constraint"
"github.com/cockroachdb/cockroach/pkg/sql/opt/props"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/json"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
)
var statsAnnID = opt.NewTableAnnID()
// statisticsBuilder is responsible for building the statistics that are
// used by the coster to estimate the cost of expressions.
//
// Background
// ----------
//
// Conceptually, there are two kinds of statistics: table statistics and
// relational expression statistics.
//
// 1. Table statistics
//
// Table statistics are stats derived from the underlying data in the
// database. These stats are calculated either automatically or on-demand for
// each table, and include the number of rows in the table as well as
// statistics about selected individual columns or sets of columns. The column
// statistics include the number of null values, the number of distinct values,
// and optionally, a histogram of the data distribution (only applicable for
// single columns, not sets of columns). These stats are only collected
// periodically to avoid overloading the database, so they may be stale. They
// are currently persisted in the system.table_statistics table (see sql/stats
// for details). Inside the optimizer, they are cached in a props.Statistics
// object as a table annotation in opt.Metadata.
//
// 2. Relational expression statistics
//
// Relational expression statistics are derived from table statistics, and are
// only valid for a particular memo group. They are used to estimate how the
// underlying table statistics change as different relational operators are
// applied. The same types of statistics are stored for relational expressions
// as for tables (row count, null count, distinct count, etc.). Inside the
// optimizer, they are stored in a props.Statistics object in the logical
// properties of the relational expression's memo group.
//
// For example, here is a query plan with corresponding estimated statistics at
// each level:
//
// Query: SELECT y FROM a WHERE x=1
//
// Plan: Project y Row Count: 10, Distinct(x): 1
// |
// Select x=1 Row Count: 10, Distinct(x): 1
// |
// Scan a Row Count: 100, Distinct(x): 10
//
// The statistics for the Scan operator were presumably retrieved from the
// underlying table statistics cached in the metadata. The statistics for
// the Select operator are determined as follows: Since the predicate x=1
// reduces the number of distinct values of x down to 1, and the previous
// distinct count of x was 10, the selectivity of the predicate is 1/10.
// Thus, the estimated number of output rows is 1/10 * 100 = 10. Finally, the
// Project operator passes through the statistics from its child expression.
//
// Statistics for expressions high up in the query tree tend to be quite
// inaccurate since the estimation errors from lower expressions are
// compounded. Still, statistics are useful throughout the query tree to help
// the optimizer choose between multiple alternative, logically equivalent
// plans.
//
// How statisticsBuilder works
// ---------------------------
//
// statisticsBuilder is responsible for building the second type of statistics,
// relational expression statistics. It builds the statistics lazily, and only
// calculates column statistics if needed to estimate the row count of an
// expression (currently, the row count is the only statistic used by the
// coster).
//
// Every relational operator has a buildXXX and a colStatXXX function. For
// example, Scan has buildScan and colStatScan. buildScan is called when the
// logical properties of a Scan expression are built. The goal of each buildXXX
// function is to calculate the number of rows output by the expression so that
// its cost can be estimated by the coster.
//
// In order to determine the row count, column statistics may be required for a
// subset of the columns of the expression. Column statistics are calculated
// recursively from the child expression(s) via calls to the colStatFromInput
// function. colStatFromInput finds the child expression that might contain the
// requested stats, and calls colStat on the child. colStat checks if the
// requested stats are already cached for the child expression, and if not,
// calls colStatXXX (where the XXX corresponds to the operator of the child
// expression). The child expression may need to calculate column statistics
// from its children, and if so, it makes another recursive call to
// colStatFromInput.
//
// The "base case" for colStatFromInput is a Scan, where the "input" is the raw
// table itself; the table statistics are retrieved from the metadata (the
// metadata may in turn need to fetch the stats from the database if they are
// not already cached). If a particular table statistic is not available, a
// best-effort guess is made (see colStatLeaf for details).
//
// To better understand how the statisticsBuilder works, let us consider this
// simple query, which consists of a scan followed by an aggregation:
//
// SELECT count(*), x, y FROM t GROUP BY x, y
//
// The statistics for the scan of t will be calculated first, since logical
// properties are built bottom-up. The estimated row count is retrieved from
// the table statistics in the metadata, so no column statistics are needed.
//
// The statistics for the group by operator are calculated second. The row
// count for GROUP BY can be determined by the distinct count of its grouping
// columns. Therefore, the statisticsBuilder recursively updates the statistics
// for the scan operator to include column stats for x and y, and then uses
// these column stats to update the statistics for GROUP BY.
//
// At each stage where column statistics are requested, the statisticsBuilder
// makes a call to colStatFromChild, which in turn calls colStat on the child
// to retrieve the cached statistics or calculate them recursively. Assuming
// that no statistics are cached, this is the order of function calls for the
// above example (somewhat simplified):
//
// +-------------+ +--------------+
// 1. | buildScan t | 2. | buildGroupBy |
// +-------------+ +--------------+
// | |
// +-----------------------+ +-------------------------+
// | makeTableStatistics t | | colStatFromChild (x, y) |
// +-----------------------+ +-------------------------+
// |
// +--------------------+
// | colStatScan (x, y) |
// +--------------------+
// |
// +---------------------+
// | colStatTable (x, y) |
// +---------------------+
// |
// +--------------------+
// | colStatLeaf (x, y) |
// +--------------------+
//
// See props/statistics.go for more details.
type statisticsBuilder struct {
evalCtx *tree.EvalContext
md *opt.Metadata
}
func (sb *statisticsBuilder) init(evalCtx *tree.EvalContext, md *opt.Metadata) {
sb.evalCtx = evalCtx
sb.md = md
}
func (sb *statisticsBuilder) clear() {
sb.evalCtx = nil
sb.md = nil
}
// colStatFromChild retrieves a column statistic from a specific child of the
// given expression.
func (sb *statisticsBuilder) colStatFromChild(
colSet opt.ColSet, e RelExpr, childIdx int,
) *props.ColumnStatistic {
// Helper function to return the column statistic if the output columns of
// the child with the given index intersect colSet.
child := e.Child(childIdx).(RelExpr)
childProps := child.Relational()
if !colSet.SubsetOf(childProps.OutputCols) {
colSet = colSet.Intersection(childProps.OutputCols)
if colSet.Empty() {
// All the columns in colSet are outer columns; therefore, we can treat
// them as a constant.
return &props.ColumnStatistic{Cols: colSet, DistinctCount: 1}
}
}
return sb.colStat(colSet, child)
}
// statsFromChild retrieves the main statistics struct from a specific child
// of the given expression.
func (sb *statisticsBuilder) statsFromChild(e RelExpr, childIdx int) *props.Statistics {
return &e.Child(childIdx).(RelExpr).Relational().Stats
}
// availabilityFromInput determines the availability of the underlying table
// statistics from the children of the expression.
func (sb *statisticsBuilder) availabilityFromInput(e RelExpr) bool {
switch t := e.(type) {
case *ScanExpr:
return sb.makeTableStatistics(t.Table).Available
case *VirtualScanExpr:
return sb.makeTableStatistics(t.Table).Available
case *LookupJoinExpr:
ensureLookupJoinInputProps(t, sb)
return t.lookupProps.Stats.Available && t.Input.Relational().Stats.Available
case *ZigzagJoinExpr:
ensureZigzagJoinInputProps(t, sb)
return t.leftProps.Stats.Available
}
available := true
for i, n := 0, e.ChildCount(); i < n; i++ {
if child, ok := e.Child(i).(RelExpr); ok {
available = available && child.Relational().Stats.Available
}
}
return available
}
// colStatFromInput retrieves a column statistic from the input(s) of a Scan,
// Select, or Join. The input to the Scan is the "raw" table.
func (sb *statisticsBuilder) colStatFromInput(colSet opt.ColSet, e RelExpr) *props.ColumnStatistic {
var lookupJoin *LookupJoinExpr
var zigzagJoin *ZigzagJoinExpr
switch t := e.(type) {
case *ScanExpr:
return sb.colStatTable(t.Table, colSet)
case *VirtualScanExpr:
return sb.colStatTable(t.Table, colSet)
case *SelectExpr:
return sb.colStatFromChild(colSet, t, 0)
case *LookupJoinExpr:
lookupJoin = t
ensureLookupJoinInputProps(lookupJoin, sb)
case *ZigzagJoinExpr:
zigzagJoin = t
ensureZigzagJoinInputProps(zigzagJoin, sb)
}
if lookupJoin != nil || zigzagJoin != nil || opt.IsJoinOp(e) || e.Op() == opt.MergeJoinOp {
var leftProps *props.Relational
if zigzagJoin != nil {
leftProps = &zigzagJoin.leftProps
} else {
leftProps = e.Child(0).(RelExpr).Relational()
}
intersectsLeft := leftProps.OutputCols.Intersects(colSet)
var intersectsRight bool
if lookupJoin != nil {
intersectsRight = lookupJoin.lookupProps.OutputCols.Intersects(colSet)
} else if zigzagJoin != nil {
intersectsRight = zigzagJoin.rightProps.OutputCols.Intersects(colSet)
} else {
intersectsRight = e.Child(1).(RelExpr).Relational().OutputCols.Intersects(colSet)
}
// It's possible that colSet intersects both left and right if we have a
// lookup join that was converted from an index join, so check the left
// side first.
if intersectsLeft {
if zigzagJoin != nil {
return sb.colStatTable(zigzagJoin.LeftTable, colSet)
}
return sb.colStatFromChild(colSet, e, 0 /* childIdx */)
}
if intersectsRight {
if lookupJoin != nil {
return sb.colStatTable(lookupJoin.Table, colSet)
}
if zigzagJoin != nil {
return sb.colStatTable(zigzagJoin.RightTable, colSet)
}
return sb.colStatFromChild(colSet, e, 1 /* childIdx */)
}
// All columns in colSet are outer columns; therefore, we can treat them
// as a constant.
return &props.ColumnStatistic{Cols: colSet, DistinctCount: 1}
}
panic(errors.AssertionFailedf("unsupported operator type %s", log.Safe(e.Op())))
}
// colStat gets a column statistic for the given set of columns if it exists.
// If the column statistic is not available in the current expression, colStat
// recursively tries to find it in the children of the expression, lazily
// populating s.ColStats with the statistic as it gets passed up the expression
// tree.
func (sb *statisticsBuilder) colStat(colSet opt.ColSet, e RelExpr) *props.ColumnStatistic {
if colSet.Empty() {
panic(errors.AssertionFailedf("column statistics cannot be determined for empty column set"))
}
// Check if the requested column statistic is already cached.
if stat, ok := e.Relational().Stats.ColStats.Lookup(colSet); ok {
return stat
}
// We only calculate statistics on the normalized expression in a memo group.
e = e.FirstExpr()
// The statistic was not found in the cache, so calculate it based on the
// type of expression.
switch e.Op() {
case opt.ScanOp:
return sb.colStatScan(colSet, e.(*ScanExpr))
case opt.VirtualScanOp:
return sb.colStatVirtualScan(colSet, e.(*VirtualScanExpr))
case opt.SelectOp:
return sb.colStatSelect(colSet, e.(*SelectExpr))
case opt.ProjectOp:
return sb.colStatProject(colSet, e.(*ProjectExpr))
case opt.ValuesOp:
return sb.colStatValues(colSet, e.(*ValuesExpr))
case opt.InnerJoinOp, opt.LeftJoinOp, opt.RightJoinOp, opt.FullJoinOp,
opt.SemiJoinOp, opt.AntiJoinOp, opt.InnerJoinApplyOp, opt.LeftJoinApplyOp,
opt.SemiJoinApplyOp, opt.AntiJoinApplyOp, opt.MergeJoinOp, opt.LookupJoinOp,
opt.ZigzagJoinOp:
return sb.colStatJoin(colSet, e)
case opt.IndexJoinOp:
return sb.colStatIndexJoin(colSet, e.(*IndexJoinExpr))
case opt.UnionOp, opt.IntersectOp, opt.ExceptOp,
opt.UnionAllOp, opt.IntersectAllOp, opt.ExceptAllOp:
return sb.colStatSetNode(colSet, e)
case opt.GroupByOp, opt.ScalarGroupByOp, opt.DistinctOnOp:
return sb.colStatGroupBy(colSet, e)
case opt.LimitOp:
return sb.colStatLimit(colSet, e.(*LimitExpr))
case opt.OffsetOp:
return sb.colStatOffset(colSet, e.(*OffsetExpr))
case opt.Max1RowOp:
return sb.colStatMax1Row(colSet, e.(*Max1RowExpr))
case opt.OrdinalityOp:
return sb.colStatOrdinality(colSet, e.(*OrdinalityExpr))
case opt.WindowOp:
return sb.colStatWindow(colSet, e.(*WindowExpr))
case opt.ProjectSetOp:
return sb.colStatProjectSet(colSet, e.(*ProjectSetExpr))
case opt.InsertOp, opt.UpdateOp, opt.UpsertOp, opt.DeleteOp:
return sb.colStatMutation(colSet, e)
case opt.SequenceSelectOp:
return sb.colStatSequenceSelect(colSet, e.(*SequenceSelectExpr))
case opt.ExplainOp, opt.ShowTraceForSessionOp,
opt.OpaqueRelOp, opt.OpaqueMutationOp, opt.OpaqueDDLOp:
return sb.colStatUnknown(colSet, e.Relational())
case opt.WithOp:
return sb.colStat(colSet, e.Child(1).(RelExpr))
case opt.WithScanOp:
// This is tricky, since if we deferred to the expression being referenced,
// the computation of stats for a WithScan would depend on something
// outside of the expression itself. Just call it unknown for now.
// TODO(justin): find a real solution for this.
return sb.colStatUnknown(colSet, e.Relational())
case opt.FakeRelOp:
panic(errors.AssertionFailedf("FakeRelOp does not contain col stat for %v", colSet))
}
panic(errors.AssertionFailedf("unrecognized relational expression type: %v", log.Safe(e.Op())))
}
// colStatLeaf creates a column statistic for a given column set (if it doesn't
// already exist in s), by deriving the statistic from the general statistics.
// Used when there is no child expression to retrieve statistics from, typically
// with the Statistics derived for a table.
func (sb *statisticsBuilder) colStatLeaf(
colSet opt.ColSet, s *props.Statistics, fd *props.FuncDepSet, notNullCols opt.ColSet,
) *props.ColumnStatistic {
// Ensure that the requested column statistic is in the cache.
colStat, added := s.ColStats.Add(colSet)
if !added {
// Already in the cache.
return colStat
}
// If some of the columns are a lax key, the distinct count equals the row
// count. The null count is 0 if these columns are not nullable, otherwise
// copy the null count from the nullable columns in colSet.
if fd.ColsAreLaxKey(colSet) {
if colSet.SubsetOf(notNullCols) {
colStat.NullCount = 0
} else {
nullableCols := colSet.Difference(notNullCols)
if nullableCols.Equals(colSet) {
// No column statistics on this colSet - use the unknown
// null count ratio.
colStat.NullCount = s.RowCount * unknownNullCountRatio
} else {
colStatLeaf := sb.colStatLeaf(nullableCols, s, fd, notNullCols)
// Fetch the colStat again since it may now have a different address.
colStat, _ = s.ColStats.Lookup(colSet)
colStat.NullCount = colStatLeaf.NullCount
}
}
// Only one of the null values counts towards the distinct count.
colStat.DistinctCount = s.RowCount - max(colStat.NullCount-1, 0)
return colStat
}
if colSet.Len() == 1 {
col, _ := colSet.Next(0)
colStat.DistinctCount = unknownDistinctCountRatio * s.RowCount
colStat.NullCount = unknownNullCountRatio * s.RowCount
if sb.md.ColumnMeta(col).Type.Family() == types.BoolFamily {
colStat.DistinctCount = min(colStat.DistinctCount, 2)
}
if notNullCols.Contains(col) {
colStat.NullCount = 0
}
} else {
distinctCount := 1.0
nullCount := 0.0
colSet.ForEach(func(i opt.ColumnID) {
colStatLeaf := sb.colStatLeaf(opt.MakeColSet(i), s, fd, notNullCols)
distinctCount *= colStatLeaf.DistinctCount
if nullCount < s.RowCount {
// Subtract the expected chance of collisions with nulls already collected.
nullCount += colStatLeaf.NullCount * (1 - nullCount/s.RowCount)
}
})
// Fetch the colStat again since it may now have a different address.
colStat, _ = s.ColStats.Lookup(colSet)
colStat.DistinctCount = min(distinctCount, s.RowCount)
colStat.NullCount = min(nullCount, s.RowCount)
}
return colStat
}
// +-------+
// | Table |
// +-------+
// makeTableStatistics returns the available statistics for the given table.
// Statistics are derived lazily and are cached in the metadata, since they may
// be accessed multiple times during query optimization. For more details, see
// props.Statistics.
func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Statistics {
stats, ok := sb.md.TableAnnotation(tabID, statsAnnID).(*props.Statistics)
if ok {
// Already made.
return stats
}
// Make now and annotate the metadata table with it for next time.
tab := sb.md.Table(tabID)
stats = &props.Statistics{}
if tab.StatisticCount() == 0 {
// No statistics.
stats.Available = false
stats.RowCount = unknownRowCount
} else {
// Get the RowCount from the most recent statistic. Stats are ordered
// with most recent first.
stats.Available = true
stats.RowCount = float64(tab.Statistic(0).RowCount())
// Make sure the row count is at least 1. The stats may be stale, and we
// can end up with weird and inefficient plans if we estimate 0 rows.
stats.RowCount = max(stats.RowCount, 1)
// Add all the column statistics, using the most recent statistic for each
// column set. Stats are ordered with most recent first.
for i := 0; i < tab.StatisticCount(); i++ {
stat := tab.Statistic(i)
var cols opt.ColSet
for i := 0; i < stat.ColumnCount(); i++ {
cols.Add(tabID.ColumnID(stat.ColumnOrdinal(i)))
}
if colStat, ok := stats.ColStats.Add(cols); ok {
colStat.DistinctCount = float64(stat.DistinctCount())
colStat.NullCount = float64(stat.NullCount())
if cols.Len() == 1 && stat.Histogram() != nil {
col, _ := cols.Next(0)
colStat.Histogram = &props.Histogram{}
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram())
}
// Make sure the distinct count is at least 1, for the same reason as
// the row count above.
colStat.DistinctCount = max(colStat.DistinctCount, 1)
}
}
}
sb.md.SetTableAnnotation(tabID, statsAnnID, stats)
return stats
}
func (sb *statisticsBuilder) colStatTable(
tabID opt.TableID, colSet opt.ColSet,
) *props.ColumnStatistic {
tableStats := sb.makeTableStatistics(tabID)
tableFD := MakeTableFuncDep(sb.md, tabID)
tableNotNullCols := tableNotNullCols(sb.md, tabID)
return sb.colStatLeaf(colSet, tableStats, tableFD, tableNotNullCols)
}
// +------+
// | Scan |
// +------+
func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relational) {
s := &relProps.Stats
if zeroCardinality := s.Init(relProps); zeroCardinality {
// Short cut if cardinality is 0.
return
}
s.Available = sb.availabilityFromInput(scan)
inputStats := sb.makeTableStatistics(scan.Table)
s.RowCount = inputStats.RowCount
if scan.Constraint != nil {
// Calculate distinct counts and histograms for constrained columns
// ----------------------------------------------------------------
var numUnappliedConjuncts float64
var constrainedCols, histCols opt.ColSet
// Inverted indexes are a special case; a constraint like:
// /1: [/'{"a": "b"}' - /'{"a": "b"}']
// does not necessarily mean there is only going to be one distinct
// value for column 1, if it is being applied to an inverted index.
// This is because inverted index keys could correspond to partial
// column values, such as one path-to-a-leaf through a JSON object.
//
// For now, don't apply constraints on inverted index columns.
if sb.md.Table(scan.Table).Index(scan.Index).IsInverted() {
for i, n := 0, scan.Constraint.ConstrainedColumns(sb.evalCtx); i < n; i++ {
numUnappliedConjuncts += sb.numConjunctsInConstraint(scan.Constraint, i)
}
} else {
constrainedCols, histCols = sb.applyIndexConstraint(scan.Constraint, scan, relProps)
}
// Calculate row count and selectivity
// -----------------------------------
inputRowCount := s.RowCount
s.ApplySelectivity(sb.selectivityFromHistograms(histCols, scan, s))
s.ApplySelectivity(sb.selectivityFromDistinctCounts(constrainedCols.Difference(histCols), scan, s))
s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
// Set null counts to 0 for non-nullable columns
// -------------------------------------------------
sb.updateNullCountsFromProps(scan, relProps, inputStats.RowCount)
s.ApplySelectivity(sb.selectivityFromNullCounts(constrainedCols, scan, s, inputRowCount))
}
sb.finalizeFromCardinality(relProps)
}
func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *props.ColumnStatistic {
relProps := scan.Relational()
s := &relProps.Stats
inputColStat := sb.colStatTable(scan.Table, colSet)
colStat := sb.copyColStat(colSet, s, inputColStat)
if sb.shouldUseHistogram(relProps) {
colStat.Histogram = inputColStat.Histogram
}
if s.Selectivity != 1 {
tableStats := sb.makeTableStatistics(scan.Table)
colStat.ApplySelectivity(s.Selectivity, tableStats.RowCount)
}
if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}
// +-------------+
// | VirtualScan |
// +-------------+
func (sb *statisticsBuilder) buildVirtualScan(scan *VirtualScanExpr, relProps *props.Relational) {
s := &relProps.Stats
if zeroCardinality := s.Init(relProps); zeroCardinality {
// Short cut if cardinality is 0.
return
}
s.Available = sb.availabilityFromInput(scan)
inputStats := sb.makeTableStatistics(scan.Table)
s.RowCount = inputStats.RowCount
sb.finalizeFromCardinality(relProps)
}
func (sb *statisticsBuilder) colStatVirtualScan(
colSet opt.ColSet, scan *VirtualScanExpr,
) *props.ColumnStatistic {
s := &scan.Relational().Stats
colStat := sb.copyColStat(colSet, s, sb.colStatTable(scan.Table, colSet))
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}
// +--------+
// | Select |
// +--------+
func (sb *statisticsBuilder) buildSelect(sel *SelectExpr, relProps *props.Relational) {
s := &relProps.Stats
if zeroCardinality := s.Init(relProps); zeroCardinality {
// Short cut if cardinality is 0.
return
}
s.Available = sb.availabilityFromInput(sel)
// Update stats based on equivalencies in the filter conditions. Note that
// EquivReps from the Select FD should not be used, as they include
// equivalencies derived from input expressions.
var equivFD props.FuncDepSet
for i := range sel.Filters {
equivFD.AddEquivFrom(&sel.Filters[i].ScalarProps(sel.Memo()).FuncDeps)
}
equivReps := equivFD.EquivReps()
// Calculate distinct counts and histograms for constrained columns
// ----------------------------------------------------------------
numUnappliedConjuncts, constrainedCols, histCols := sb.applyFilter(sel.Filters, sel, relProps)
// Try to reduce the number of columns used for selectivity
// calculation based on functional dependencies.
inputFD := &sel.Input.Relational().FuncDeps
nonReducedCols := constrainedCols
constrainedCols = sb.tryReduceCols(constrainedCols, s, inputFD)
// Calculate selectivity and row count
// -----------------------------------
inputStats := &sel.Input.Relational().Stats
s.RowCount = inputStats.RowCount
inputRowCount := s.RowCount
s.ApplySelectivity(sb.selectivityFromHistograms(histCols, sel, s))
s.ApplySelectivity(sb.selectivityFromDistinctCounts(constrainedCols.Difference(histCols), sel, s))
s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &relProps.FuncDeps, sel, s))
s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
// Update distinct counts based on equivalencies; this should happen after
// selectivityFromDistinctCounts and selectivityFromEquivalencies.
sb.applyEquivalencies(equivReps, &relProps.FuncDeps, sel, relProps)
// Update null counts for non-nullable columns.
sb.updateNullCountsFromProps(sel, relProps, inputRowCount)
// Note that for null count selectivity calculations, we use the un-reduced constraint
// columns. This is so we don't miss any null-rejecting filters. For example, in this
// query: SELECT min(y) FROM xyz WHERE x = 1 , where y is nullable and x is a key (so
// x -> y in the FD set), we still need to be able to catch the null rejection property
// on y inferred by the min aggregation and modify the row count accordingly.
//
// TODO(itsbilal): Calculate the one column in each FD group that yields the most
// selective (i.e. lowest selectivity value) from its null count reduction, and use
// that instead of just calculating selectivities from all columns. The current
// solution could easily lead to double-counting of null selectivities across
// multiple correlated columns.
s.ApplySelectivity(sb.selectivityFromNullCounts(nonReducedCols, sel, s, inputRowCount))
sb.finalizeFromCardinality(relProps)
}
func (sb *statisticsBuilder) colStatSelect(
colSet opt.ColSet, sel *SelectExpr,
) *props.ColumnStatistic {
relProps := sel.Relational()
s := &relProps.Stats
inputStats := &sel.Input.Relational().Stats
colStat := sb.copyColStatFromChild(colSet, sel, s)
// It's not safe to use s.Selectivity, because it's possible that some of the
// filter conditions were pushed down into the input after s.Selectivity
// was calculated. For example, an index scan or index join created during
// exploration could absorb some of the filter conditions.
selectivity := s.RowCount / inputStats.RowCount
colStat.ApplySelectivity(selectivity, inputStats.RowCount)
if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}
// +---------+
// | Project |
// +---------+
func (sb *statisticsBuilder) buildProject(prj *ProjectExpr, relProps *props.Relational) {
s := &relProps.Stats
if zeroCardinality := s.Init(relProps); zeroCardinality {
// Short cut if cardinality is 0.
return
}
s.Available = sb.availabilityFromInput(prj)
inputStats := &prj.Input.Relational().Stats
s.RowCount = inputStats.RowCount
sb.finalizeFromCardinality(relProps)
}
func (sb *statisticsBuilder) colStatProject(
colSet opt.ColSet, prj *ProjectExpr,
) *props.ColumnStatistic {
relProps := prj.Relational()
s := &relProps.Stats
// Columns may be passed through from the input, or they may reference a
// higher scope (in the case of a correlated subquery), or they
// may be synthesized by the projection operation.
inputCols := prj.Input.Relational().OutputCols
reqInputCols := colSet.Intersection(inputCols)
nullOpFound := false
if reqSynthCols := colSet.Difference(inputCols); !reqSynthCols.Empty() {
// Some of the columns in colSet were synthesized or from a higher scope
// (in the case of a correlated subquery). We assume that the statistics of
// the synthesized columns are the same as the statistics of their input
// columns. For example, the distinct count of (x + 2) is the same as the
// distinct count of x.
// TODO(rytaft): This assumption breaks down for certain types of
// expressions, such as (x < y).
for i := range prj.Projections {
item := &prj.Projections[i]
if reqSynthCols.Contains(item.Col) {
reqInputCols.UnionWith(item.scalar.OuterCols)
// If the element is a null constant, account for that
// when calculating null counts.
if item.Element.Op() == opt.NullOp {
nullOpFound = true
}
}
}
// Intersect with the input columns one more time to remove any columns
// from higher scopes. Columns from higher scopes are effectively constant
// in this scope, and therefore have distinct count = 1.
reqInputCols.IntersectionWith(inputCols)
}
colStat, _ := s.ColStats.Add(colSet)
if !reqInputCols.Empty() {
// Inherit column statistics from input, using the reqInputCols identified
// above.
inputColStat := sb.colStatFromChild(reqInputCols, prj, 0 /* childIdx */)
colStat.DistinctCount = inputColStat.DistinctCount
if nullOpFound {
colStat.NullCount = s.RowCount
} else {
colStat.NullCount = inputColStat.NullCount
}
} else {
// There are no columns in this expression, so it must be a constant.
colStat.DistinctCount = 1
if nullOpFound {
colStat.NullCount = s.RowCount
} else {
colStat.NullCount = 0
}
}
if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
sb.finalizeFromRowCount(colStat, s.RowCount)
return colStat
}
// +------+
// | Join |
// +------+
func (sb *statisticsBuilder) buildJoin(
join RelExpr, relProps *props.Relational, h *joinPropsHelper,
) {
// Zigzag joins have their own stats builder case.
if join.Op() == opt.ZigzagJoinOp {
sb.buildZigzagJoin(join.(*ZigzagJoinExpr), relProps, h)
return
}
s := &relProps.Stats
if zeroCardinality := s.Init(relProps); zeroCardinality {
// Short cut if cardinality is 0.
return
}
s.Available = sb.availabilityFromInput(join)
leftStats := &h.leftProps.Stats
rightStats := &h.rightProps.Stats
leftCols := h.leftProps.OutputCols.Copy()
rightCols := h.rightProps.OutputCols.Copy()
equivReps := h.filtersFD.EquivReps()
// Shortcut if there are no ON conditions. Note that for lookup join, there
// are implicit equality conditions on KeyCols.
if h.filterIsTrue {
s.RowCount = leftStats.RowCount * rightStats.RowCount
s.Selectivity = 1
switch h.joinType {
case opt.InnerJoinOp, opt.InnerJoinApplyOp:
case opt.LeftJoinOp, opt.LeftJoinApplyOp:
// All rows from left side should be in the result.
s.RowCount = max(s.RowCount, leftStats.RowCount)
case opt.RightJoinOp:
// All rows from right side should be in the result.
s.RowCount = max(s.RowCount, rightStats.RowCount)
case opt.FullJoinOp:
// All rows from both sides should be in the result.
s.RowCount = max(s.RowCount, leftStats.RowCount)
s.RowCount = max(s.RowCount, rightStats.RowCount)
case opt.SemiJoinOp, opt.SemiJoinApplyOp:
s.RowCount = leftStats.RowCount
case opt.AntiJoinOp, opt.AntiJoinApplyOp:
s.RowCount = 0
s.Selectivity = 0
}
return
}
// Shortcut if the ON condition is false or there is a contradiction.
if h.filters.IsFalse() {
s.Selectivity = 0
switch h.joinType {
case opt.InnerJoinOp, opt.InnerJoinApplyOp:
s.RowCount = 0
case opt.LeftJoinOp, opt.LeftJoinApplyOp:
// All rows from left side should be in the result.
s.RowCount = leftStats.RowCount
case opt.RightJoinOp:
// All rows from right side should be in the result.
s.RowCount = rightStats.RowCount
case opt.FullJoinOp:
// All rows from both sides should be in the result.
s.RowCount = leftStats.RowCount + rightStats.RowCount
case opt.SemiJoinOp, opt.SemiJoinApplyOp:
s.RowCount = 0
case opt.AntiJoinOp, opt.AntiJoinApplyOp:
s.RowCount = leftStats.RowCount
s.Selectivity = 1
}
return
}
// Calculate distinct counts for constrained columns in the ON conditions
// ----------------------------------------------------------------------
// TODO(rytaft): use histogram for joins.
numUnappliedConjuncts, constrainedCols, _ := sb.applyFilter(h.filters, join, relProps)
// Try to reduce the number of columns used for selectivity
// calculation based on functional dependencies.
constrainedCols = sb.tryReduceJoinCols(
constrainedCols,
s,
h.leftProps.OutputCols,
h.rightProps.OutputCols,
&h.leftProps.FuncDeps,
&h.rightProps.FuncDeps,
)
// Calculate selectivity and row count
// -----------------------------------
s.RowCount = leftStats.RowCount * rightStats.RowCount
// Save the initial row count before ON conditions are applied.
inputRowCount := s.RowCount
switch h.joinType {
case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
// Treat anti join as if it were a semi join for the selectivity
// calculations. It will be fixed below.
s.RowCount = leftStats.RowCount
inputRowCount = leftStats.RowCount
s.ApplySelectivity(sb.selectivityFromEquivalenciesSemiJoin(
equivReps, h.leftProps.OutputCols, h.rightProps.OutputCols, &h.filtersFD, join, s,
))
default:
if h.rightProps.FuncDeps.ColsAreStrictKey(h.selfJoinCols) {
// This is like an index join, so apply a selectivity that will result
// in leftStats.RowCount rows.
s.ApplySelectivity(1 / rightStats.RowCount)
} else {
// Add the self join columns to equivReps so they are included in the
// calculation for selectivityFromEquivalencies below.
equivReps.UnionWith(h.selfJoinCols)
}
s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &h.filtersFD, join, s))
}
s.ApplySelectivity(sb.selectivityFromDistinctCounts(constrainedCols, join, s))
s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
// Update distinct counts based on equivalencies; this should happen after
// selectivityFromDistinctCounts and selectivityFromEquivalencies.
sb.applyEquivalencies(equivReps, &h.filtersFD, join, relProps)
// Update null counts for non-nullable columns.
sb.updateNullCountsFromProps(join, relProps, inputRowCount)
switch h.joinType {
case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
// Keep only column stats from the left side.
s.ColStats.RemoveIntersecting(h.rightProps.OutputCols)
s.ApplySelectivity(sb.selectivityFromNullCounts(constrainedCols, join, s, inputRowCount))
default:
s.ApplySelectivity(sb.joinSelectivityFromNullCounts(
constrainedCols,
join,
s,
inputRowCount,
leftCols,
leftStats.RowCount,
rightCols,
rightStats.RowCount,
))
}
// The above calculation is for inner joins. Other joins need to remove stats
// that involve outer columns.
switch h.joinType {
case opt.LeftJoinOp, opt.LeftJoinApplyOp:
// Keep only column stats from the right side. The stats from the left side
// are not valid.
s.ColStats.RemoveIntersecting(h.leftProps.OutputCols)
case opt.RightJoinOp:
// Keep only column stats from the left side. The stats from the right side
// are not valid.
s.ColStats.RemoveIntersecting(h.rightProps.OutputCols)
case opt.FullJoinOp:
// Do not keep any column stats.
s.ColStats.Clear()
}
// Tweak the row count.
innerJoinRowCount := s.RowCount
switch h.joinType {
case opt.LeftJoinOp, opt.LeftJoinApplyOp:
// All rows from left side should be in the result.
s.RowCount = max(innerJoinRowCount, leftStats.RowCount)
case opt.RightJoinOp:
// All rows from right side should be in the result.
s.RowCount = max(innerJoinRowCount, rightStats.RowCount)
case opt.FullJoinOp:
// All rows from both sides should be in the result.
// T(A FOJ B) = T(A LOJ B) + T(A ROJ B) - T(A IJ B)
leftJoinRowCount := max(innerJoinRowCount, leftStats.RowCount)
rightJoinRowCount := max(innerJoinRowCount, rightStats.RowCount)
s.RowCount = leftJoinRowCount + rightJoinRowCount - innerJoinRowCount