-
Notifications
You must be signed in to change notification settings - Fork 3.9k
/
Copy pathcoster.go
1620 lines (1450 loc) · 66.7 KB
/
coster.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package xform
import (
"math"
"math/rand"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/sql/opt"
"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
"github.com/cockroachdb/cockroach/pkg/sql/opt/memo"
"github.com/cockroachdb/cockroach/pkg/sql/opt/ordering"
"github.com/cockroachdb/cockroach/pkg/sql/opt/props"
"github.com/cockroachdb/cockroach/pkg/sql/opt/props/physical"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
"golang.org/x/tools/container/intsets"
)
// Coster is used by the optimizer to assign a cost to a candidate expression
// that can provide a set of required physical properties. If a candidate
// expression has a lower cost than any other expression in the memo group, then
// it becomes the new best expression for the group.
//
// The set of costing formulas maintained by the coster for the set of all
// operators constitute the "cost model". A given cost model can be designed to
// maximize any optimization goal, such as:
//
// 1. Max aggregate cluster throughput (txns/sec across cluster)
// 2. Min transaction latency (time to commit txns)
// 3. Min latency to first row (time to get first row of txns)
// 4. Min memory usage
// 5. Some weighted combination of #1 - #4
//
// The cost model in this file targets #1 as the optimization goal. However,
// note that #2 is implicitly important to that goal, since overall cluster
// throughput will suffer if there are lots of pending transactions waiting on
// I/O.
//
// Coster is an interface so that different costing algorithms can be used by
// the optimizer. For example, the OptSteps command uses a custom coster that
// assigns infinite costs to some expressions in order to prevent them from
// being part of the lowest cost tree (for debugging purposes).
type Coster interface {
// ComputeCost returns the estimated cost of executing the candidate
// expression. The optimizer does not expect the cost to correspond to any
// real-world metric, but does expect costs to be comparable to one another,
// as well as summable.
ComputeCost(candidate memo.RelExpr, required *physical.Required) memo.Cost
}
// coster encapsulates the default cost model for the optimizer. The coster
// assigns an estimated cost to each expression in the memo so that the
// optimizer can choose the lowest cost expression tree. The estimated cost is
// a best-effort approximation of the actual cost of execution, based on table
// and index statistics that are propagated throughout the logical expression
// tree.
type coster struct {
evalCtx *tree.EvalContext
mem *memo.Memo
// locality gives the location of the current node as a set of user-defined
// key/value pairs, ordered from most inclusive to least inclusive. If there
// are no tiers, then the node's location is not known. Example:
//
// [region=us,dc=east]
//
locality roachpb.Locality
// perturbation indicates how much to randomly perturb the cost. It is used
// to generate alternative plans for testing. For example, if perturbation is
// 0.5, and the estimated cost of an expression is c, the cost returned by
// ComputeCost will be in the range [c - 0.5 * c, c + 0.5 * c).
perturbation float64
}
var _ Coster = &coster{}
// MakeDefaultCoster creates an instance of the default coster.
func MakeDefaultCoster(mem *memo.Memo) Coster {
return &coster{mem: mem}
}
const (
// These costs have been copied from the Postgres optimizer:
// https://github.com/postgres/postgres/blob/master/src/include/optimizer/cost.h
// TODO(rytaft): "How Good are Query Optimizers, Really?" says that the
// PostgreSQL ratio between CPU and I/O is probably unrealistic in modern
// systems since much of the data can be cached in memory. Consider
// increasing the cpuCostFactor to account for this.
cpuCostFactor = 0.01
seqIOCostFactor = 1
randIOCostFactor = 4
// TODO(justin): make this more sophisticated.
// lookupJoinRetrieveRowCost is the cost to retrieve a single row during a
// lookup join.
// See https://github.com/cockroachdb/cockroach/pull/35561 for the initial
// justification for this constant.
lookupJoinRetrieveRowCost = 2 * seqIOCostFactor
// virtualScanTableDescriptorFetchCost is the cost to retrieve the table
// descriptors when performing a virtual table scan.
virtualScanTableDescriptorFetchCost = 25 * randIOCostFactor
// Input rows to a join are processed in batches of this size.
// See joinreader.go.
joinReaderBatchSize = 100.0
// latencyCostFactor represents the throughput impact of doing scans on an
// index that may be remotely located in a different locality. If latencies
// are higher, then overall cluster throughput will suffer somewhat, as there
// will be more queries in memory blocking on I/O. The impact on throughput
// is expected to be relatively low, so latencyCostFactor is set to a small
// value. However, even a low value will cause the optimizer to prefer
// indexes that are likely to be geographically closer, if they are otherwise
// the same cost to access.
// TODO(andyk): Need to do analysis to figure out right value and/or to come
// up with better way to incorporate latency into the coster.
latencyCostFactor = cpuCostFactor
// hugeCost is used with expressions we want to avoid; these are expressions
// that "violate" a hint like forcing a specific index or join algorithm.
// If the final expression has this cost or larger, it means that there was no
// plan that could satisfy the hints.
hugeCost memo.Cost = 1e100
// fullScanRowCountPenalty adds a penalty to full table scans. This is especially
// useful for empty or very small tables, where we would get plans that are
// surprising to users (like full scans instead of point lookups).
fullScanRowCountPenalty = 10
// unboundedMaxCardinalityScanCostPenalty adds a penalty to scans with
// unbounded maximum cardinality. This helps prevent surprising plans for very
// small tables or for when stats are stale. For full table scans, this
// penalty is added on top of the fullScanRowCountPenalty.
unboundedMaxCardinalityScanCostPenalty = 10
// largeMaxCardinalityScanCostPenalty is the maximum penalty to add to scans
// with a bounded maximum cardinality exceeding the row count estimate. This
// helps prevent surprising plans for very small tables or for when stats are
// stale.
largeMaxCardinalityScanCostPenalty = unboundedMaxCardinalityScanCostPenalty / 2
// preferLookupJoinFactor is a scale factor for the cost of a lookup join when
// we have a hint for preferring a lookup join.
preferLookupJoinFactor = 1e-6
// noSpillRowCount represents the maximum number of rows that should have no
// buffering cost because we expect they will never need to be spilled to
// disk. Since 64MB is the default work mem limit, 64 rows will not cause a
// disk spill unless the rows are at least 1 MB on average.
noSpillRowCount = 64
// spillRowCount represents the minimum number of rows that we expect will
// always need to be spilled to disk. Since 64MB is the default work mem
// limit, 6400000 rows with an average of at least 10 bytes per row will cause
// a disk spill.
spillRowCount = 6400000
// spillCostFactor is the cost of spilling to disk. We use seqIOCostFactor to
// model the cost of spilling to disk, because although there will be some
// random I/O required to insert rows into a sorted structure, the inherent
// batching in the LSM tree should amortize the cost.
spillCostFactor = seqIOCostFactor
)
// fnCost maps some functions to an execution cost. Currently this list
// contains only st_* functions, including some we don't have implemented
// yet. Although function costs differ based on the overload (due to
// arguments), here we are using the minimum from similar functions based on
// postgres' pg_proc table. The following query can be used to generate this table:
// SELECT proname, min(procost) FROM pg_proc WHERE proname LIKE 'st\_%' AND procost > 1 GROUP BY proname ORDER BY proname
// TODO(mjibson): Add costs directly to overloads. When that is done, we should
// also add a test that ensures those costs match postgres.
var fnCost = map[string]memo.Cost{
"st_3dclosestpoint": 1000 * cpuCostFactor,
"st_3ddfullywithin": 10000 * cpuCostFactor,
"st_3ddistance": 1000 * cpuCostFactor,
"st_3ddwithin": 10000 * cpuCostFactor,
"st_3dintersects": 10000 * cpuCostFactor,
"st_3dlength": 100 * cpuCostFactor,
"st_3dlongestline": 1000 * cpuCostFactor,
"st_3dmakebox": 100 * cpuCostFactor,
"st_3dmaxdistance": 1000 * cpuCostFactor,
"st_3dperimeter": 100 * cpuCostFactor,
"st_3dshortestline": 1000 * cpuCostFactor,
"st_addmeasure": 1000 * cpuCostFactor,
"st_addpoint": 100 * cpuCostFactor,
"st_affine": 100 * cpuCostFactor,
"st_angle": 100 * cpuCostFactor,
"st_area": 100 * cpuCostFactor,
"st_area2d": 100 * cpuCostFactor,
"st_asbinary": 100 * cpuCostFactor,
"st_asencodedpolyline": 100 * cpuCostFactor,
"st_asewkb": 100 * cpuCostFactor,
"st_asewkt": 100 * cpuCostFactor,
"st_asgeojson": 100 * cpuCostFactor,
"st_asgml": 100 * cpuCostFactor,
"st_ashexewkb": 100 * cpuCostFactor,
"st_askml": 100 * cpuCostFactor,
"st_aslatlontext": 100 * cpuCostFactor,
"st_assvg": 100 * cpuCostFactor,
"st_astext": 100 * cpuCostFactor,
"st_astwkb": 1000 * cpuCostFactor,
"st_asx3d": 100 * cpuCostFactor,
"st_azimuth": 100 * cpuCostFactor,
"st_bdmpolyfromtext": 100 * cpuCostFactor,
"st_bdpolyfromtext": 100 * cpuCostFactor,
"st_boundary": 1000 * cpuCostFactor,
"st_boundingdiagonal": 100 * cpuCostFactor,
"st_box2dfromgeohash": 1000 * cpuCostFactor,
"st_buffer": 100 * cpuCostFactor,
"st_buildarea": 10000 * cpuCostFactor,
"st_centroid": 100 * cpuCostFactor,
"st_chaikinsmoothing": 10000 * cpuCostFactor,
"st_cleangeometry": 10000 * cpuCostFactor,
"st_clipbybox2d": 10000 * cpuCostFactor,
"st_closestpoint": 1000 * cpuCostFactor,
"st_closestpointofapproach": 10000 * cpuCostFactor,
"st_clusterdbscan": 10000 * cpuCostFactor,
"st_clusterintersecting": 10000 * cpuCostFactor,
"st_clusterkmeans": 10000 * cpuCostFactor,
"st_clusterwithin": 10000 * cpuCostFactor,
"st_collectionextract": 100 * cpuCostFactor,
"st_collectionhomogenize": 100 * cpuCostFactor,
"st_concavehull": 10000 * cpuCostFactor,
"st_contains": 10000 * cpuCostFactor,
"st_containsproperly": 10000 * cpuCostFactor,
"st_convexhull": 10000 * cpuCostFactor,
"st_coorddim": 100 * cpuCostFactor,
"st_coveredby": 100 * cpuCostFactor,
"st_covers": 100 * cpuCostFactor,
"st_cpawithin": 10000 * cpuCostFactor,
"st_createtopogeo": 100 * cpuCostFactor,
"st_crosses": 10000 * cpuCostFactor,
"st_curvetoline": 10000 * cpuCostFactor,
"st_delaunaytriangles": 10000 * cpuCostFactor,
"st_dfullywithin": 10000 * cpuCostFactor,
"st_difference": 10000 * cpuCostFactor,
"st_dimension": 100 * cpuCostFactor,
"st_disjoint": 10000 * cpuCostFactor,
"st_distance": 100 * cpuCostFactor,
"st_distancecpa": 10000 * cpuCostFactor,
"st_distancesphere": 100 * cpuCostFactor,
"st_distancespheroid": 1000 * cpuCostFactor,
"st_dump": 1000 * cpuCostFactor,
"st_dumppoints": 100 * cpuCostFactor,
"st_dumprings": 1000 * cpuCostFactor,
"st_dwithin": 100 * cpuCostFactor,
"st_endpoint": 100 * cpuCostFactor,
"st_envelope": 100 * cpuCostFactor,
"st_equals": 10000 * cpuCostFactor,
"st_expand": 100 * cpuCostFactor,
"st_exteriorring": 100 * cpuCostFactor,
"st_filterbym": 1000 * cpuCostFactor,
"st_findextent": 100 * cpuCostFactor,
"st_flipcoordinates": 1000 * cpuCostFactor,
"st_force2d": 100 * cpuCostFactor,
"st_force3d": 100 * cpuCostFactor,
"st_force3dm": 100 * cpuCostFactor,
"st_force3dz": 100 * cpuCostFactor,
"st_force4d": 100 * cpuCostFactor,
"st_forcecollection": 100 * cpuCostFactor,
"st_forcecurve": 1000 * cpuCostFactor,
"st_forcepolygonccw": 100 * cpuCostFactor,
"st_forcepolygoncw": 1000 * cpuCostFactor,
"st_forcerhr": 1000 * cpuCostFactor,
"st_forcesfs": 1000 * cpuCostFactor,
"st_frechetdistance": 10000 * cpuCostFactor,
"st_generatepoints": 10000 * cpuCostFactor,
"st_geogfromtext": 100 * cpuCostFactor,
"st_geogfromwkb": 100 * cpuCostFactor,
"st_geographyfromtext": 100 * cpuCostFactor,
"st_geohash": 1000 * cpuCostFactor,
"st_geomcollfromtext": 100 * cpuCostFactor,
"st_geomcollfromwkb": 100 * cpuCostFactor,
"st_geometricmedian": 10000 * cpuCostFactor,
"st_geometryfromtext": 1000 * cpuCostFactor,
"st_geometryn": 100 * cpuCostFactor,
"st_geometrytype": 100 * cpuCostFactor,
"st_geomfromewkb": 100 * cpuCostFactor,
"st_geomfromewkt": 100 * cpuCostFactor,
"st_geomfromgeohash": 1000 * cpuCostFactor,
"st_geomfromgeojson": 1000 * cpuCostFactor,
"st_geomfromgml": 100 * cpuCostFactor,
"st_geomfromkml": 1000 * cpuCostFactor,
"st_geomfromtext": 1000 * cpuCostFactor,
"st_geomfromtwkb": 100 * cpuCostFactor,
"st_geomfromwkb": 100 * cpuCostFactor,
"st_gmltosql": 100 * cpuCostFactor,
"st_hasarc": 100 * cpuCostFactor,
"st_hausdorffdistance": 10000 * cpuCostFactor,
"st_inittopogeo": 100 * cpuCostFactor,
"st_interiorringn": 100 * cpuCostFactor,
"st_interpolatepoint": 1000 * cpuCostFactor,
"st_intersection": 100 * cpuCostFactor,
"st_intersects": 100 * cpuCostFactor,
"st_isclosed": 100 * cpuCostFactor,
"st_iscollection": 1000 * cpuCostFactor,
"st_isempty": 100 * cpuCostFactor,
"st_ispolygonccw": 100 * cpuCostFactor,
"st_ispolygoncw": 100 * cpuCostFactor,
"st_isring": 1000 * cpuCostFactor,
"st_issimple": 1000 * cpuCostFactor,
"st_isvalid": 100 * cpuCostFactor,
"st_isvaliddetail": 10000 * cpuCostFactor,
"st_isvalidreason": 100 * cpuCostFactor,
"st_isvalidtrajectory": 10000 * cpuCostFactor,
"st_length": 100 * cpuCostFactor,
"st_length2d": 100 * cpuCostFactor,
"st_length2dspheroid": 1000 * cpuCostFactor,
"st_lengthspheroid": 1000 * cpuCostFactor,
"st_linecrossingdirection": 10000 * cpuCostFactor,
"st_linefromencodedpolyline": 1000 * cpuCostFactor,
"st_linefrommultipoint": 100 * cpuCostFactor,
"st_linefromtext": 100 * cpuCostFactor,
"st_linefromwkb": 100 * cpuCostFactor,
"st_lineinterpolatepoint": 1000 * cpuCostFactor,
"st_lineinterpolatepoints": 1000 * cpuCostFactor,
"st_linelocatepoint": 1000 * cpuCostFactor,
"st_linemerge": 10000 * cpuCostFactor,
"st_linestringfromwkb": 100 * cpuCostFactor,
"st_linesubstring": 1000 * cpuCostFactor,
"st_linetocurve": 10000 * cpuCostFactor,
"st_locatealong": 1000 * cpuCostFactor,
"st_locatebetween": 1000 * cpuCostFactor,
"st_locatebetweenelevations": 1000 * cpuCostFactor,
"st_longestline": 100 * cpuCostFactor,
"st_makeenvelope": 100 * cpuCostFactor,
"st_makeline": 100 * cpuCostFactor,
"st_makepoint": 100 * cpuCostFactor,
"st_makepointm": 100 * cpuCostFactor,
"st_makepolygon": 100 * cpuCostFactor,
"st_makevalid": 10000 * cpuCostFactor,
"st_maxdistance": 100 * cpuCostFactor,
"st_memsize": 100 * cpuCostFactor,
"st_minimumboundingcircle": 10000 * cpuCostFactor,
"st_minimumboundingradius": 10000 * cpuCostFactor,
"st_minimumclearance": 10000 * cpuCostFactor,
"st_minimumclearanceline": 10000 * cpuCostFactor,
"st_mlinefromtext": 100 * cpuCostFactor,
"st_mlinefromwkb": 100 * cpuCostFactor,
"st_mpointfromtext": 100 * cpuCostFactor,
"st_mpointfromwkb": 100 * cpuCostFactor,
"st_mpolyfromtext": 100 * cpuCostFactor,
"st_mpolyfromwkb": 100 * cpuCostFactor,
"st_multi": 100 * cpuCostFactor,
"st_multilinefromwkb": 100 * cpuCostFactor,
"st_multilinestringfromtext": 100 * cpuCostFactor,
"st_multipointfromtext": 100 * cpuCostFactor,
"st_multipointfromwkb": 100 * cpuCostFactor,
"st_multipolyfromwkb": 100 * cpuCostFactor,
"st_multipolygonfromtext": 100 * cpuCostFactor,
"st_node": 10000 * cpuCostFactor,
"st_normalize": 100 * cpuCostFactor,
"st_npoints": 100 * cpuCostFactor,
"st_nrings": 100 * cpuCostFactor,
"st_numgeometries": 100 * cpuCostFactor,
"st_numinteriorring": 100 * cpuCostFactor,
"st_numinteriorrings": 100 * cpuCostFactor,
"st_numpatches": 100 * cpuCostFactor,
"st_numpoints": 100 * cpuCostFactor,
"st_offsetcurve": 10000 * cpuCostFactor,
"st_orderingequals": 10000 * cpuCostFactor,
"st_orientedenvelope": 10000 * cpuCostFactor,
"st_overlaps": 10000 * cpuCostFactor,
"st_patchn": 100 * cpuCostFactor,
"st_perimeter": 100 * cpuCostFactor,
"st_perimeter2d": 100 * cpuCostFactor,
"st_point": 100 * cpuCostFactor,
"st_pointfromgeohash": 1000 * cpuCostFactor,
"st_pointfromtext": 100 * cpuCostFactor,
"st_pointfromwkb": 100 * cpuCostFactor,
"st_pointinsidecircle": 1000 * cpuCostFactor,
"st_pointn": 100 * cpuCostFactor,
"st_pointonsurface": 1000 * cpuCostFactor,
"st_points": 1000 * cpuCostFactor,
"st_polyfromtext": 100 * cpuCostFactor,
"st_polyfromwkb": 100 * cpuCostFactor,
"st_polygon": 100 * cpuCostFactor,
"st_polygonfromtext": 100 * cpuCostFactor,
"st_polygonfromwkb": 100 * cpuCostFactor,
"st_polygonize": 10000 * cpuCostFactor,
"st_project": 1000 * cpuCostFactor,
"st_quantizecoordinates": 1000 * cpuCostFactor,
"st_relate": 10000 * cpuCostFactor,
"st_relatematch": 1000 * cpuCostFactor,
"st_removepoint": 100 * cpuCostFactor,
"st_removerepeatedpoints": 1000 * cpuCostFactor,
"st_reverse": 1000 * cpuCostFactor,
"st_rotate": 100 * cpuCostFactor,
"st_rotatex": 100 * cpuCostFactor,
"st_rotatey": 100 * cpuCostFactor,
"st_rotatez": 100 * cpuCostFactor,
"st_scale": 100 * cpuCostFactor,
"st_segmentize": 1000 * cpuCostFactor,
"st_seteffectivearea": 1000 * cpuCostFactor,
"st_setpoint": 100 * cpuCostFactor,
"st_setsrid": 100 * cpuCostFactor,
"st_sharedpaths": 10000 * cpuCostFactor,
"st_shortestline": 1000 * cpuCostFactor,
"st_simplify": 100 * cpuCostFactor,
"st_simplifypreservetopology": 10000 * cpuCostFactor,
"st_simplifyvw": 10000 * cpuCostFactor,
"st_snap": 10000 * cpuCostFactor,
"st_snaptogrid": 100 * cpuCostFactor,
"st_split": 10000 * cpuCostFactor,
"st_srid": 100 * cpuCostFactor,
"st_startpoint": 100 * cpuCostFactor,
"st_subdivide": 10000 * cpuCostFactor,
"st_summary": 100 * cpuCostFactor,
"st_swapordinates": 100 * cpuCostFactor,
"st_symdifference": 10000 * cpuCostFactor,
"st_symmetricdifference": 10000 * cpuCostFactor,
"st_tileenvelope": 100 * cpuCostFactor,
"st_touches": 10000 * cpuCostFactor,
"st_transform": 100 * cpuCostFactor,
"st_translate": 100 * cpuCostFactor,
"st_transscale": 100 * cpuCostFactor,
"st_unaryunion": 10000 * cpuCostFactor,
"st_union": 10000 * cpuCostFactor,
"st_voronoilines": 100 * cpuCostFactor,
"st_voronoipolygons": 100 * cpuCostFactor,
"st_within": 10000 * cpuCostFactor,
"st_wkbtosql": 100 * cpuCostFactor,
"st_wkttosql": 1000 * cpuCostFactor,
}
// Init initializes a new coster structure with the given memo.
func (c *coster) Init(evalCtx *tree.EvalContext, mem *memo.Memo, perturbation float64) {
// This initialization pattern ensures that fields are not unwittingly
// reused. Field reuse must be explicit.
*c = coster{
evalCtx: evalCtx,
mem: mem,
locality: evalCtx.Locality,
perturbation: perturbation,
}
}
// ComputeCost calculates the estimated cost of the top-level operator in a
// candidate best expression, based on its logical properties and those of its
// children.
//
// Note: each custom function to compute the cost of an operator calculates
// the cost based on Big-O estimated complexity. Most constant factors are
// ignored for now.
func (c *coster) ComputeCost(candidate memo.RelExpr, required *physical.Required) memo.Cost {
var cost memo.Cost
switch candidate.Op() {
case opt.TopKOp:
cost = c.computeTopKCost(candidate.(*memo.TopKExpr), required)
case opt.SortOp:
cost = c.computeSortCost(candidate.(*memo.SortExpr), required)
case opt.ScanOp:
cost = c.computeScanCost(candidate.(*memo.ScanExpr), required)
case opt.SelectOp:
cost = c.computeSelectCost(candidate.(*memo.SelectExpr), required)
case opt.ProjectOp:
cost = c.computeProjectCost(candidate.(*memo.ProjectExpr))
case opt.InvertedFilterOp:
cost = c.computeInvertedFilterCost(candidate.(*memo.InvertedFilterExpr))
case opt.ValuesOp:
cost = c.computeValuesCost(candidate.(*memo.ValuesExpr))
case opt.InnerJoinOp, opt.LeftJoinOp, opt.RightJoinOp, opt.FullJoinOp,
opt.SemiJoinOp, opt.AntiJoinOp, opt.InnerJoinApplyOp, opt.LeftJoinApplyOp,
opt.SemiJoinApplyOp, opt.AntiJoinApplyOp:
// All join ops use hash join by default.
cost = c.computeHashJoinCost(candidate)
case opt.MergeJoinOp:
cost = c.computeMergeJoinCost(candidate.(*memo.MergeJoinExpr))
case opt.IndexJoinOp:
cost = c.computeIndexJoinCost(candidate.(*memo.IndexJoinExpr), required)
case opt.LookupJoinOp:
cost = c.computeLookupJoinCost(candidate.(*memo.LookupJoinExpr), required)
case opt.InvertedJoinOp:
cost = c.computeInvertedJoinCost(candidate.(*memo.InvertedJoinExpr), required)
case opt.ZigzagJoinOp:
cost = c.computeZigzagJoinCost(candidate.(*memo.ZigzagJoinExpr))
case opt.UnionOp, opt.IntersectOp, opt.ExceptOp,
opt.UnionAllOp, opt.IntersectAllOp, opt.ExceptAllOp, opt.LocalityOptimizedSearchOp:
cost = c.computeSetCost(candidate)
case opt.GroupByOp, opt.ScalarGroupByOp, opt.DistinctOnOp, opt.EnsureDistinctOnOp,
opt.UpsertDistinctOnOp, opt.EnsureUpsertDistinctOnOp:
cost = c.computeGroupingCost(candidate, required)
case opt.LimitOp:
cost = c.computeLimitCost(candidate.(*memo.LimitExpr))
case opt.OffsetOp:
cost = c.computeOffsetCost(candidate.(*memo.OffsetExpr))
case opt.OrdinalityOp:
cost = c.computeOrdinalityCost(candidate.(*memo.OrdinalityExpr))
case opt.ProjectSetOp:
cost = c.computeProjectSetCost(candidate.(*memo.ProjectSetExpr))
case opt.ExplainOp:
// Technically, the cost of an Explain operation is independent of the cost
// of the underlying plan. However, we want to explain the plan we would get
// without EXPLAIN, i.e. the lowest cost plan. So do nothing special to get
// default behavior.
}
// Add a one-time cost for any operator, meant to reflect the cost of setting
// up execution for the operator. This makes plans with fewer operators
// preferable, all else being equal.
cost += cpuCostFactor
// Add a one-time cost for any operator with unbounded cardinality. This
// ensures we prefer plans that push limits as far down the tree as possible,
// all else being equal.
if candidate.Relational().Cardinality.IsUnbounded() {
cost += cpuCostFactor
}
if !cost.Less(memo.MaxCost) {
// Optsteps uses MaxCost to suppress nodes in the memo. When a node with
// MaxCost is added to the memo, it can lead to an obscure crash with an
// unknown node. We'd rather detect this early.
panic(errors.AssertionFailedf("node %s with MaxCost added to the memo", log.Safe(candidate.Op())))
}
if c.perturbation != 0 {
// Don't perturb the cost if we are forcing an index.
if cost < hugeCost {
// Get a random value in the range [-1.0, 1.0)
multiplier := 2*rand.Float64() - 1
// If perturbation is p, and the estimated cost of an expression is c,
// the new cost is in the range [max(0, c - pc), c + pc). For example,
// if p=1.5, the new cost is in the range [0, c + 1.5 * c).
cost += cost * memo.Cost(c.perturbation*multiplier)
// The cost must always be >= 0.
if cost < 0 {
cost = 0
}
}
}
return cost
}
func (c *coster) computeTopKCost(topk *memo.TopKExpr, required *physical.Required) memo.Cost {
rel := topk.Relational()
outputRowCount := rel.Stats.RowCount
inputRowCount := topk.Input.Relational().Stats.RowCount
if !required.Ordering.Any() {
// When there is a partial ordering of the input rows' sort columns, we may
// be able to reduce the number of input rows needed to find the top K rows.
inputRowCount = topKInputLimitHint(c.mem, topk, inputRowCount, outputRowCount, float64(topk.K))
}
// Add the cost of sorting.
// Start with a cost of storing each row; TopK sort only stores K rows in a
// max heap.
cost := memo.Cost(cpuCostFactor * float64(rel.OutputCols.Len()) * outputRowCount)
// Add buffering cost for the output rows.
cost += c.rowBufferCost(outputRowCount)
// In the worst case, there are O(N*log(K)) comparisons to compare each row in
// the input to the top of the max heap and sift the max heap if each row
// compared is in the top K found so far.
cost += c.rowCmpCost(len(topk.Ordering.Columns)) * memo.Cost((1+math.Log2(math.Max(outputRowCount, 1)))*inputRowCount)
// TODO(harding): Add the CPU cost of emitting the K output rows. This should
// be done in conjunction with computeSortCost.
return cost
}
func (c *coster) computeSortCost(sort *memo.SortExpr, required *physical.Required) memo.Cost {
// We calculate the cost of a (potentially) segmented sort.
//
// In a non-segmented sort, we have a single segment to sort according to
// required.Ordering.Columns.
//
// In a segmented sort, rows are split into segments according to
// InputOrdering.Columns; each segment is sorted according to the remaining
// columns from required.Ordering.Columns.
numKeyCols := len(required.Ordering.Columns)
numPreorderedCols := len(sort.InputOrdering.Columns)
rel := sort.Relational()
stats := rel.Stats
numSegments := c.countSegments(sort)
// Start with a cost of storing each row; this takes the total number of
// columns into account so that a sort on fewer columns is preferred (e.g.
// sort before projecting a new column).
cost := memo.Cost(cpuCostFactor * float64(rel.OutputCols.Len()) * stats.RowCount)
if !sort.InputOrdering.Any() {
// Add the cost for finding the segments: each row is compared to the
// previous row on the preordered columns. Most of these comparisons will
// yield equality, so we don't use rowCmpCost(): we expect to have to
// compare all preordered columns.
cost += cpuCostFactor * memo.Cost(numPreorderedCols) * memo.Cost(stats.RowCount)
}
// Add the cost to sort the segments. On average, each row is involved in
// O(log(segmentSize)) comparisons.
numCmpOpsPerRow := float64(1)
if segmentSize := stats.RowCount / numSegments; segmentSize > 1 {
numCmpOpsPerRow += math.Log2(segmentSize)
// Add a cost for buffering rows that takes into account increased memory
// pressure and the possibility of spilling to disk.
cost += memo.Cost(numSegments) * c.rowBufferCost(segmentSize)
}
cost += c.rowCmpCost(numKeyCols-numPreorderedCols) * memo.Cost(numCmpOpsPerRow*stats.RowCount)
// TODO(harding): Add the CPU cost of emitting the output rows. This should be
// done in conjunction with computeTopKCost.
return cost
}
func (c *coster) computeScanCost(scan *memo.ScanExpr, required *physical.Required) memo.Cost {
if scan.Flags.ForceIndex && scan.Flags.Index != scan.Index || scan.Flags.ForceZigzag {
// If we are forcing an index, any other index has a very high cost. In
// practice, this will only happen when this is a primary index scan.
return hugeCost
}
isUnfiltered := scan.IsUnfiltered(c.mem.Metadata())
if scan.Flags.NoFullScan {
// Normally a full scan of a partial index would be allowed with the
// NO_FULL_SCAN hint (isUnfiltered is false for partial indexes), but if the
// user has explicitly forced the partial index *and* used NO_FULL_SCAN, we
// disallow the full index scan.
if isUnfiltered || (scan.Flags.ForceIndex && scan.IsFullIndexScan(c.mem.Metadata())) {
return hugeCost
}
}
stats := scan.Relational().Stats
rowCount := stats.RowCount
if isUnfiltered && c.evalCtx != nil && c.evalCtx.SessionData().DisallowFullTableScans {
isLarge := !stats.Available || rowCount > c.evalCtx.SessionData().LargeFullScanRows
if isLarge {
return hugeCost
}
}
// Scanning an index with a few columns is faster than scanning an index with
// many columns. Ideally, we would want to use statistics about the size of
// each column. In lieu of that, use the number of columns.
perRowCost := c.rowScanCost(scan.Table, scan.Index, scan.Cols.Len())
numSpans := 1
if scan.Constraint != nil {
numSpans = scan.Constraint.Spans.Count()
} else if scan.InvertedConstraint != nil {
numSpans = len(scan.InvertedConstraint)
}
baseCost := memo.Cost(numSpans * randIOCostFactor)
// If this is a virtual scan, add the cost of fetching table descriptors.
if c.mem.Metadata().Table(scan.Table).IsVirtualTable() {
baseCost += virtualScanTableDescriptorFetchCost
}
// Performing a reverse scan is more expensive than a forward scan, but it's
// still preferable to sorting the output of a forward scan. To ensure we
// choose a reverse scan over a sort, add the reverse scan cost before we
// alter the row count for unbounded scan penalties below. This cost must also
// be added before adjusting the row count for the limit hint.
if ordering.ScanIsReverse(scan, &required.Ordering) {
if rowCount > 1 {
// Need to do binary search to seek to the previous row.
perRowCost += memo.Cost(math.Log2(rowCount)) * cpuCostFactor
}
}
// Add a penalty to full table scans. All else being equal, we prefer a
// constrained scan. Adding a few rows worth of cost helps prevent surprising
// plans for very small tables.
if isUnfiltered {
rowCount += fullScanRowCountPenalty
// For tables with multiple partitions, add the cost of visiting each
// partition.
// TODO(rytaft): In the future we should take latency into account here.
index := c.mem.Metadata().Table(scan.Table).Index(scan.Index)
if partitionCount := index.PartitionCount(); partitionCount > 1 {
// Subtract 1 since we already accounted for the first partition when
// counting spans.
baseCost += memo.Cost(partitionCount-1) * randIOCostFactor
}
}
// Add a penalty if the cardinality exceeds the row count estimate. Adding a
// few rows worth of cost helps prevent surprising plans for very small tables
// or for when stats are stale.
//
// Note: we add this to the baseCost rather than the rowCount, so that the
// number of index columns does not have an outsized effect on the cost of
// the scan. See issue #68556.
baseCost += c.largeCardinalityCostPenalty(scan.Relational().Cardinality, rowCount)
if required.LimitHint != 0 {
rowCount = math.Min(rowCount, required.LimitHint)
}
cost := baseCost + memo.Cost(rowCount)*(seqIOCostFactor+perRowCost)
// If this scan is locality optimized, divide the cost by 3 in order to make
// the total cost of the two scans in the locality optimized plan less than
// the cost of the single scan in the non-locality optimized plan.
// TODO(rytaft): This is hacky. We should really be making this determination
// based on the latency between regions.
if scan.LocalityOptimized {
cost /= 3
}
return cost
}
func (c *coster) computeSelectCost(sel *memo.SelectExpr, required *physical.Required) memo.Cost {
// Typically the filter has to be evaluated on each input row.
inputRowCount := sel.Input.Relational().Stats.RowCount
// If there is a LimitHint, n, it is expected that the filter will only be
// evaluated on the number of rows required to produce n rows.
if required.LimitHint != 0 {
selectivity := sel.Relational().Stats.Selectivity.AsFloat()
inputRowCount = math.Min(inputRowCount, required.LimitHint/selectivity)
}
filterSetup, filterPerRow := c.computeFiltersCost(sel.Filters, util.FastIntMap{})
cost := memo.Cost(inputRowCount) * filterPerRow
cost += filterSetup
return cost
}
func (c *coster) computeProjectCost(prj *memo.ProjectExpr) memo.Cost {
// Each synthesized column causes an expression to be evaluated on each row.
rowCount := prj.Relational().Stats.RowCount
synthesizedColCount := len(prj.Projections)
cost := memo.Cost(rowCount) * memo.Cost(synthesizedColCount) * cpuCostFactor
// Add the CPU cost of emitting the rows.
cost += memo.Cost(rowCount) * cpuCostFactor
return cost
}
func (c *coster) computeInvertedFilterCost(invFilter *memo.InvertedFilterExpr) memo.Cost {
// The filter has to be evaluated on each input row.
inputRowCount := invFilter.Input.Relational().Stats.RowCount
cost := memo.Cost(inputRowCount) * cpuCostFactor
return cost
}
func (c *coster) computeValuesCost(values *memo.ValuesExpr) memo.Cost {
return memo.Cost(values.Relational().Stats.RowCount) * cpuCostFactor
}
func (c *coster) computeHashJoinCost(join memo.RelExpr) memo.Cost {
if join.Private().(*memo.JoinPrivate).Flags.Has(memo.DisallowHashJoinStoreRight) {
return hugeCost
}
leftRowCount := join.Child(0).(memo.RelExpr).Relational().Stats.RowCount
rightRowCount := join.Child(1).(memo.RelExpr).Relational().Stats.RowCount
if (join.Op() == opt.SemiJoinOp || join.Op() == opt.AntiJoinOp) && leftRowCount < rightRowCount {
// If we have a semi or an anti join, during the execbuilding we choose
// the relation with smaller cardinality to be on the right side, so we
// need to swap row counts accordingly.
// TODO(raduberinde): we might also need to look at memo.JoinFlags when
// choosing a side.
leftRowCount, rightRowCount = rightRowCount, leftRowCount
}
// A hash join must process every row from both tables once.
//
// We add some factors to account for the hashtable build and lookups. The
// right side is the one stored in the hashtable, so we use a larger factor
// for that side. This ensures that a join with the smaller right side is
// preferred to the symmetric join.
cost := memo.Cost(1.25*leftRowCount+1.75*rightRowCount) * cpuCostFactor
// Add a cost for buffering rows that takes into account increased memory
// pressure and the possibility of spilling to disk.
cost += c.rowBufferCost(rightRowCount)
// Compute filter cost. Fetch the equality columns so they can be
// ignored later.
on := join.Child(2).(*memo.FiltersExpr)
leftEq, rightEq := memo.ExtractJoinEqualityColumns(
join.Child(0).(memo.RelExpr).Relational().OutputCols,
join.Child(1).(memo.RelExpr).Relational().OutputCols,
*on,
)
// Generate a quick way to lookup if two columns are join equality
// columns. We add in both directions because we don't know which way
// the equality filters will be defined.
eqMap := util.FastIntMap{}
for i := range leftEq {
left := int(leftEq[i])
right := int(rightEq[i])
eqMap.Set(left, right)
eqMap.Set(right, left)
}
filterSetup, filterPerRow := c.computeFiltersCost(*on, eqMap)
cost += filterSetup
// Add the CPU cost of emitting the rows.
rowsProcessed, ok := c.mem.RowsProcessed(join)
if !ok {
// This can happen as part of testing. In this case just return the number
// of rows.
rowsProcessed = join.Relational().Stats.RowCount
}
cost += memo.Cost(rowsProcessed) * filterPerRow
return cost
}
func (c *coster) computeMergeJoinCost(join *memo.MergeJoinExpr) memo.Cost {
if join.MergeJoinPrivate.Flags.Has(memo.DisallowMergeJoin) {
return hugeCost
}
leftRowCount := join.Left.Relational().Stats.RowCount
rightRowCount := join.Right.Relational().Stats.RowCount
if (join.Op() == opt.SemiJoinOp || join.Op() == opt.AntiJoinOp) && leftRowCount < rightRowCount {
// If we have a semi or an anti join, during the execbuilding we choose
// the relation with smaller cardinality to be on the right side, so we
// need to swap row counts accordingly.
// TODO(raduberinde): we might also need to look at memo.JoinFlags when
// choosing a side.
leftRowCount, rightRowCount = rightRowCount, leftRowCount
}
// The vectorized merge join in some cases buffers rows from the right side
// whereas the left side is processed in a streaming fashion. To account for
// this difference, we multiply both row counts so that a join with the
// smaller right side is preferred to the symmetric join.
cost := memo.Cost(0.9*leftRowCount+1.1*rightRowCount) * cpuCostFactor
filterSetup, filterPerRow := c.computeFiltersCost(join.On, util.FastIntMap{})
cost += filterSetup
// Add the CPU cost of emitting the rows.
rowsProcessed, ok := c.mem.RowsProcessed(join)
if !ok {
// We shouldn't ever get here. Since we don't allow the memo
// to be optimized twice, the coster should never be used after
// logPropsBuilder.clear() is called.
panic(errors.AssertionFailedf("could not get rows processed for merge join"))
}
cost += memo.Cost(rowsProcessed) * filterPerRow
return cost
}
func (c *coster) computeIndexJoinCost(
join *memo.IndexJoinExpr, required *physical.Required,
) memo.Cost {
return c.computeIndexLookupJoinCost(
join,
required,
true, /* lookupColsAreTableKey */
memo.TrueFilter,
join.Cols,
join.Table,
cat.PrimaryIndex,
memo.JoinFlags(0),
false, /* localityOptimized */
)
}
func (c *coster) computeLookupJoinCost(
join *memo.LookupJoinExpr, required *physical.Required,
) memo.Cost {
if join.LookupJoinPrivate.Flags.Has(memo.DisallowLookupJoinIntoRight) {
return hugeCost
}
return c.computeIndexLookupJoinCost(
join,
required,
join.LookupColsAreTableKey,
join.On,
join.Cols,
join.Table,
join.Index,
join.Flags,
join.LocalityOptimized,
)
}
func (c *coster) computeIndexLookupJoinCost(
join memo.RelExpr,
required *physical.Required,
lookupColsAreTableKey bool,
on memo.FiltersExpr,
cols opt.ColSet,
table opt.TableID,
index cat.IndexOrdinal,
flags memo.JoinFlags,
localityOptimized bool,
) memo.Cost {
input := join.Child(0).(memo.RelExpr)
lookupCount := input.Relational().Stats.RowCount
// Take into account that the "internal" row count is higher, according to
// the selectivities of the conditions. In particular, we need to ignore
// left-over conditions that are not selective.
// For example:
// ab JOIN xy ON a=x AND x=10
// becomes (during normalization):
// ab JOIN xy ON a=x AND a=10 AND x=10
// which can become a lookup join with left-over condition x=10 which doesn't
// actually filter anything.
rowsProcessed, ok := c.mem.RowsProcessed(join)
if !ok {
// We shouldn't ever get here. Since we don't allow the memo
// to be optimized twice, the coster should never be used after
// logPropsBuilder.clear() is called.
panic(errors.AssertionFailedf("could not get rows processed for lookup join"))
}
// Lookup joins can return early if enough rows have been found. An otherwise
// expensive lookup join might have a lower cost if its limit hint estimates
// that most rows will not be needed.
if required.LimitHint != 0 && lookupCount > 0 {
outputRows := join.Relational().Stats.RowCount
unlimitedLookupCount := lookupCount
lookupCount = lookupJoinInputLimitHint(unlimitedLookupCount, outputRows, required.LimitHint)
// We scale the number of rows processed by the same factor (we are
// calculating the average number of rows processed per lookup and
// multiplying by the new lookup count).
rowsProcessed = (rowsProcessed / unlimitedLookupCount) * lookupCount
}
// The rows in the (left) input are used to probe into the (right) table.
// Since the matching rows in the table may not all be in the same range, this
// counts as random I/O.
perLookupCost := memo.Cost(randIOCostFactor)
if !lookupColsAreTableKey {
// If the lookup columns don't form a key, execution will have to limit
// KV batches which prevents running requests to multiple nodes in parallel.
// An experiment on a 4 node cluster with a table with 100k rows split into
// 100 ranges showed that a "non-parallel" lookup join is about 5 times
// slower.
perLookupCost *= 5
}
if c.mem.Metadata().Table(table).IsVirtualTable() {
// It's expensive to perform a lookup join into a virtual table because
// we need to fetch the table descriptors on each lookup.
perLookupCost += virtualScanTableDescriptorFetchCost
}
perLookupCost += lookupExprCost(join)
cost := memo.Cost(lookupCount) * perLookupCost
filterSetup, filterPerRow := c.computeFiltersCost(on, util.FastIntMap{})
cost += filterSetup
// Each lookup might retrieve many rows; add the IO cost of retrieving the
// rows (relevant when we expect many resulting rows per lookup) and the CPU
// cost of emitting the rows.
numLookupCols := cols.Difference(input.Relational().OutputCols).Len()
perRowCost := lookupJoinRetrieveRowCost + filterPerRow +
c.rowScanCost(table, index, numLookupCols)
cost += memo.Cost(rowsProcessed) * perRowCost
if flags.Has(memo.PreferLookupJoinIntoRight) {
// If we prefer a lookup join, make the cost much smaller.
cost *= preferLookupJoinFactor
}
// If this lookup join is locality optimized, divide the cost by 2.5 in order to make
// the total cost of the two lookup joins in the locality optimized plan less than
// the cost of the single lookup join in the non-locality optimized plan.
// TODO(rytaft): This is hacky. We should really be making this determination