diff --git a/pkg/ccl/logictestccl/testdata/logic_test/zone b/pkg/ccl/logictestccl/testdata/logic_test/zone new file mode 100644 index 000000000000..e639e7a89eec --- /dev/null +++ b/pkg/ccl/logictestccl/testdata/logic_test/zone @@ -0,0 +1,75 @@ +# LogicTest: 5node-dist-opt + +# Ensure that cost-based-optimizer uses an index with zone constraints that most +# closely matches the gateway's locality. + +statement ok +CREATE TABLE t ( + k INT PRIMARY KEY, + v STRING, + INDEX secondary (k) STORING (v) +); + +# ------------------------------------------------------------------------------ +# Put table in dc2 and secondary index in dc1 so that the gateway matches the +# secondary index rather the primary index. +# ------------------------------------------------------------------------------ + +statement ok +ALTER TABLE t CONFIGURE ZONE USING constraints='[+region=test,+dc=dc2]' + +statement ok +ALTER INDEX t@secondary CONFIGURE ZONE USING constraints='[+region=test,+dc=dc1]' + +query TTT +EXPLAIN SELECT * FROM t WHERE k=10 +---- +scan · · +· table t@secondary +· spans /10-/11 + +# ------------------------------------------------------------------------------ +# Swap location of primary and secondary indexes and ensure that primary index +# is used instead. +# ------------------------------------------------------------------------------ + +statement ok +ALTER TABLE t CONFIGURE ZONE USING constraints='[+region=test,+dc=dc1]' + +statement ok +ALTER INDEX t@secondary CONFIGURE ZONE USING constraints='[+region=test,+dc=dc2]' + +query TTT +EXPLAIN SELECT * FROM t WHERE k=10 +---- +scan · · +· table t@primary +· spans /10-/10/# + +# ------------------------------------------------------------------------------ +# Use PREPARE to make sure that the prepared plan is invalidated when the +# secondary index's constraints change. +# ------------------------------------------------------------------------------ + +statement +PREPARE p AS SELECT tree, field, description FROM [EXPLAIN SELECT k, v FROM t WHERE k=10] + +query TTT +EXECUTE p +---- +scan · · +· table t@primary +· spans /10-/10/# + +statement ok +ALTER TABLE t CONFIGURE ZONE USING constraints='[+region=test,+dc=dc2]' + +statement ok +ALTER INDEX t@secondary CONFIGURE ZONE USING constraints='[+region=test,+dc=dc1]' + +query TTT +EXECUTE p +---- +scan · · +· table t@secondary +· spans /10-/11 diff --git a/pkg/config/zone.go b/pkg/config/zone.go index 03ccede83a27..f87ec352e3b8 100644 --- a/pkg/config/zone.go +++ b/pkg/config/zone.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" "github.com/cockroachdb/cockroach/pkg/sql/parser" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" @@ -726,3 +727,43 @@ func (z ZoneConfig) subzoneSplits() []roachpb.RKey { } return out } + +// ReplicaConstraintsCount is part of the cat.Zone interface. +func (z *ZoneConfig) ReplicaConstraintsCount() int { + return len(z.Constraints) +} + +// ReplicaConstraints is part of the cat.Zone interface. +func (z *ZoneConfig) ReplicaConstraints(i int) cat.ReplicaConstraints { + return &z.Constraints[i] +} + +// ReplicaCount is part of the cat.ReplicaConstraints interface. +func (c *Constraints) ReplicaCount() int32 { + return c.NumReplicas +} + +// ConstraintCount is part of the cat.ReplicaConstraints interface. +func (c *Constraints) ConstraintCount() int { + return len(c.Constraints) +} + +// Constraint is part of the cat.ReplicaConstraints interface. +func (c *Constraints) Constraint(i int) cat.Constraint { + return &c.Constraints[i] +} + +// IsRequired is part of the cat.Constraint interface. +func (c *Constraint) IsRequired() bool { + return c.Type == Constraint_REQUIRED +} + +// GetKey is part of the cat.Constraint interface. +func (c *Constraint) GetKey() string { + return c.Key +} + +// GetValue is part of the cat.Constraint interface. +func (c *Constraint) GetValue() string { + return c.Value +} diff --git a/pkg/server/server.go b/pkg/server/server.go index 7b6b2285b121..3db1ea96b5b8 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -600,6 +600,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) { execCfg = sql.ExecutorConfig{ Settings: s.st, NodeInfo: nodeInfo, + Locality: s.cfg.Locality, AmbientCtx: s.cfg.AmbientCtx, DB: s.db, Gossip: s.gossip, diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index 2d01a205d397..dae1a9e20a68 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -1814,6 +1814,7 @@ func (ex *connExecutor) initEvalCtx(ctx context.Context, evalCtx *extendedEvalCo TestingKnobs: ex.server.cfg.EvalContextTestingKnobs, ClusterID: ex.server.cfg.ClusterID(), NodeID: ex.server.cfg.NodeID.Get(), + Locality: ex.server.cfg.Locality, ReCache: ex.server.reCache, InternalExecutor: ie, }, diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index e8644528477c..cd276ec6f03b 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -331,6 +331,7 @@ type nodeStatusGenerator interface { type ExecutorConfig struct { Settings *cluster.Settings NodeInfo + Locality roachpb.Locality AmbientCtx log.AmbientContext DB *client.DB Gossip *gossip.Gossip diff --git a/pkg/sql/logictest/testdata/logic_test/crdb_internal b/pkg/sql/logictest/testdata/logic_test/crdb_internal index d829f55228a6..2d82a91866c3 100644 --- a/pkg/sql/logictest/testdata/logic_test/crdb_internal +++ b/pkg/sql/logictest/testdata/logic_test/crdb_internal @@ -290,8 +290,8 @@ node_id component field value query ITTTTT colnames SELECT node_id, network, regexp_replace(address, '\d+$', '') as address, attrs, locality, regexp_replace(server_version, '^\d+\.\d+(-\d+)?$', '') as server_version FROM crdb_internal.gossip_nodes WHERE node_id = 1 ---- -node_id network address attrs locality server_version -1 tcp 127.0.0.1: [] {"region": "test"} +node_id network address attrs locality server_version +1 tcp 127.0.0.1: [] {"dc": "dc1", "region": "test"} query IITBB colnames SELECT node_id, epoch, regexp_replace(expiration, '^\d+\.\d+,\d+$', '') as expiration, draining, decommissioning FROM crdb_internal.gossip_liveness WHERE node_id = 1 @@ -303,8 +303,8 @@ query ITTTTTT colnames SELECT node_id, network, regexp_replace(address, '\d+$', '') as address, attrs, locality, regexp_replace(server_version, '^\d+\.\d+(-\d+)?$', '') as server_version, regexp_replace(go_version, '^go.+$', '') as go_version FROM crdb_internal.kv_node_status WHERE node_id = 1 ---- -node_id network address attrs locality server_version go_version -1 tcp 127.0.0.1: [] {"region": "test"} +node_id network address attrs locality server_version go_version +1 tcp 127.0.0.1: [] {"dc": "dc1", "region": "test"} query IITI colnames SELECT node_id, store_id, attrs, used diff --git a/pkg/sql/opt/cat/index.go b/pkg/sql/opt/cat/index.go index a9d7d31eb69e..2fb2bc37549d 100644 --- a/pkg/sql/opt/cat/index.go +++ b/pkg/sql/opt/cat/index.go @@ -117,6 +117,16 @@ type Index interface { // of an outbound foreign key relation. Returns false for the second // return value if there is no foreign key reference on this index. ForeignKey() (ForeignKeyReference, bool) + + // Zone returns the zone which constrains placement of the index's range + // replicas. If the index was not explicitly assigned to a zone, then it + // inherits the zone of its owning table (which in turn inherits from its + // owning database or the default zone). In addition, any unspecified zone + // information will also be inherited. + // + // NOTE: This zone always applies to the entire index and never to any + // partifular partition of the index. + Zone() Zone } // IndexColumn describes a single column that is part of an index definition. diff --git a/pkg/sql/opt/cat/sequence.go b/pkg/sql/opt/cat/sequence.go index 49f86e01eb24..d705bf05a270 100644 --- a/pkg/sql/opt/cat/sequence.go +++ b/pkg/sql/opt/cat/sequence.go @@ -30,8 +30,8 @@ type Sequence interface { SequenceName() *tree.TableName } -// FormatCatalogSequence nicely formats a catalog sequence using a treeprinter for +// FormatSequence nicely formats a catalog sequence using a treeprinter for // debugging and testing. -func FormatCatalogSequence(cat Catalog, seq Sequence, tp treeprinter.Node) { +func FormatSequence(cat Catalog, seq Sequence, tp treeprinter.Node) { tp.Childf("SEQUENCE %s", seq.Name()) } diff --git a/pkg/sql/opt/cat/table.go b/pkg/sql/opt/cat/table.go index dda8796e3d29..f6feb2bf0f43 100644 --- a/pkg/sql/opt/cat/table.go +++ b/pkg/sql/opt/cat/table.go @@ -202,9 +202,9 @@ func FindTableColumnByName(tab Table, name tree.Name) int { return -1 } -// FormatCatalogTable nicely formats a catalog table using a treeprinter for -// debugging and testing. -func FormatCatalogTable(cat Catalog, tab Table, tp treeprinter.Node) { +// FormatTable nicely formats a catalog table using a treeprinter for debugging +// and testing. +func FormatTable(cat Catalog, tab Table, tp treeprinter.Node) { child := tp.Childf("TABLE %s", tab.Name().TableName) var buf bytes.Buffer diff --git a/pkg/sql/opt/cat/view.go b/pkg/sql/opt/cat/view.go index 9cc2deeae48c..23115bf3bc6e 100644 --- a/pkg/sql/opt/cat/view.go +++ b/pkg/sql/opt/cat/view.go @@ -40,9 +40,9 @@ type View interface { ColumnName(i int) tree.Name } -// FormatCatalogView nicely formats a catalog view using a treeprinter for -// debugging and testing. -func FormatCatalogView(view View, tp treeprinter.Node) { +// FormatView nicely formats a catalog view using a treeprinter for debugging +// and testing. +func FormatView(view View, tp treeprinter.Node) { var buf bytes.Buffer if view.ColumnNameCount() > 0 { buf.WriteString(" (") diff --git a/pkg/sql/opt/cat/zone.go b/pkg/sql/opt/cat/zone.go new file mode 100644 index 000000000000..7c85971197d4 --- /dev/null +++ b/pkg/sql/opt/cat/zone.go @@ -0,0 +1,116 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package cat + +import ( + "bytes" + "fmt" + + "github.com/cockroachdb/cockroach/pkg/util/treeprinter" +) + +// Zone is an interface to zone configuration information used by the optimizer. +// The optimizer prefers indexes with constraints that best match the locality +// of the gateway node that plans the query. +type Zone interface { + // ReplicaConstraintsCount returns the number of replica constraint sets that + // are part of this zone. + ReplicaConstraintsCount() int + + // ReplicaConstraints returns the ith set of replica constraints in the zone, + // where i < ReplicaConstraintsCount. + ReplicaConstraints(i int) ReplicaConstraints +} + +// ReplicaConstraints is a set of constraints that apply to one or more replicas +// of a range, restricting which nodes can host that range. For example, if a +// table range has three replicas, then two of the replicas might be pinned to +// nodes in one region, whereas the third might be pinned to another region. +type ReplicaConstraints interface { + // ReplicaCount returns the number of replicas that should abide by this set + // of constraints. If 0, then the constraints apply to all replicas of the + // range (and there can be only one ReplicaConstraints in the Zone). + ReplicaCount() int32 + + // ConstraintCount returns the number of constraints in the set. + ConstraintCount() int + + // Constraint returns the ith constraint in the set, where + // i < ConstraintCount. + Constraint(i int) Constraint +} + +// Constraint governs placement of range replicas on nodes. A constraint can +// either be required or prohibited. A required constraint's key/value pair must +// match one of the tiers of a node's locality for the range to locate there. +// A prohibited constraint's key/value pair must *not* match any of the tiers of +// a node's locality for the range to locate there. For example: +// +// +region=east Range can only be placed on nodes in region=east locality. +// -region=west Range cannot be placed on nodes in region=west locality. +// +type Constraint interface { + // IsRequired is true if this is a required constraint, or false if this is + // a prohibited constraint (signified by initial + or - character). + IsRequired() bool + + // GetKey returns the constraint's string key (to left of =). + GetKey() string + + // GetValue returns the constraint's string value (to right of =). + GetValue() string +} + +// FormatZone nicely formats a catalog zone using a treeprinter for debugging +// and testing. +func FormatZone(zone Zone, tp treeprinter.Node) { + child := tp.Childf("ZONE") + if zone.ReplicaConstraintsCount() > 1 { + child = child.Childf("replica constraints") + } + for i, n := 0, zone.ReplicaConstraintsCount(); i < n; i++ { + replConstraint := zone.ReplicaConstraints(i) + constraintStr := formatReplicaConstraint(replConstraint) + if zone.ReplicaConstraintsCount() > 1 { + numReplicas := replConstraint.ReplicaCount() + child.Childf("%d replicas: %s", numReplicas, constraintStr) + } else { + child.Childf("constraints: %s", constraintStr) + } + } +} + +func formatReplicaConstraint(replConstraint ReplicaConstraints) string { + var buf bytes.Buffer + buf.WriteRune('[') + for i, n := 0, replConstraint.ConstraintCount(); i < n; i++ { + constraint := replConstraint.Constraint(i) + if i != 0 { + buf.WriteRune(',') + } + if constraint.IsRequired() { + buf.WriteRune('+') + } else { + buf.WriteRune('-') + } + if constraint.GetKey() != "" { + fmt.Fprintf(&buf, "%s=%s", constraint.GetKey(), constraint.GetValue()) + } else { + buf.WriteString(constraint.GetValue()) + } + } + buf.WriteRune(']') + return buf.String() +} diff --git a/pkg/sql/opt/exec/execbuilder/testdata/distsql_agg b/pkg/sql/opt/exec/execbuilder/testdata/distsql_agg index 4a875cff898b..5f0f452e0235 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/distsql_agg +++ b/pkg/sql/opt/exec/execbuilder/testdata/distsql_agg @@ -337,7 +337,7 @@ group-by ├── grouping columns: b:2 ├── internal-ordering: +2 opt(1) ├── stats: [rows=9.5617925, distinct(2)=9.5617925, null(2)=0] - ├── cost: 10.7156179 + ├── cost: 11.1156179 ├── key: (2) ├── fd: (2)-->(3) ├── prune: (3) @@ -345,7 +345,7 @@ group-by │ ├── columns: a:1 b:2 │ ├── constraint: /1/2: [/1 - /1] │ ├── stats: [rows=10, distinct(1)=1, null(1)=0, distinct(2)=9.5617925, null(2)=0] - │ ├── cost: 10.41 + │ ├── cost: 10.81 │ ├── key: (2) │ ├── fd: ()-->(1) │ ├── ordering: +2 opt(1) [actual: +2] diff --git a/pkg/sql/opt/exec/execbuilder/testdata/explain b/pkg/sql/opt/exec/execbuilder/testdata/explain index 7df851660c84..60a797aa7460 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/explain +++ b/pkg/sql/opt/exec/execbuilder/testdata/explain @@ -647,21 +647,21 @@ EXPLAIN (OPT,VERBOSE) SELECT * FROM tc WHERE a = 10 ORDER BY b sort ├── columns: a:1 b:2 ├── stats: [rows=9.9, distinct(1)=1, null(1)=0] - ├── cost: 51.3728708 + ├── cost: 52.2638708 ├── fd: ()-->(1) ├── ordering: +2 opt(1) [actual: +2] ├── prune: (2) └── index-join tc ├── columns: a:1 b:2 ├── stats: [rows=9.9, distinct(1)=1, null(1)=0] - ├── cost: 50.51 + ├── cost: 51.401 ├── fd: ()-->(1) ├── prune: (2) └── scan tc@c ├── columns: a:1 rowid:3 ├── constraint: /1/3: [/10 - /10] ├── stats: [rows=9.9, distinct(1)=1, null(1)=0, distinct(3)=9.9, null(3)=0] - ├── cost: 10.306 + ├── cost: 10.702 ├── key: (3) └── fd: ()-->(1) @@ -671,21 +671,21 @@ EXPLAIN (OPT,TYPES) SELECT * FROM tc WHERE a = 10 ORDER BY b sort ├── columns: a:1(int!null) b:2(int) ├── stats: [rows=9.9, distinct(1)=1, null(1)=0] - ├── cost: 51.3728708 + ├── cost: 52.2638708 ├── fd: ()-->(1) ├── ordering: +2 opt(1) [actual: +2] ├── prune: (2) └── index-join tc ├── columns: a:1(int!null) b:2(int) ├── stats: [rows=9.9, distinct(1)=1, null(1)=0] - ├── cost: 50.51 + ├── cost: 51.401 ├── fd: ()-->(1) ├── prune: (2) └── scan tc@c ├── columns: a:1(int!null) rowid:3(int!null) ├── constraint: /1/3: [/10 - /10] ├── stats: [rows=9.9, distinct(1)=1, null(1)=0, distinct(3)=9.9, null(3)=0] - ├── cost: 10.306 + ├── cost: 10.702 ├── key: (3) └── fd: ()-->(1) @@ -707,24 +707,24 @@ EXPLAIN (OPT, VERBOSE) SELECT * FROM tc WHERE a + 2 * b > 1 ORDER BY a*b sort ├── columns: a:1 b:2 [hidden: column4:4] ├── stats: [rows=333.333333] - ├── cost: 1129.24548 + ├── cost: 1179.24548 ├── fd: (1,2)-->(4) ├── ordering: +4 ├── prune: (1,2,4) └── project ├── columns: column4:4 a:1 b:2 ├── stats: [rows=333.333333] - ├── cost: 1066.69667 + ├── cost: 1116.69667 ├── fd: (1,2)-->(4) ├── prune: (1,2,4) ├── select │ ├── columns: a:1 b:2 │ ├── stats: [rows=333.333333] - │ ├── cost: 1060.02 + │ ├── cost: 1110.02 │ ├── scan tc │ │ ├── columns: a:1 b:2 │ │ ├── stats: [rows=1000] - │ │ ├── cost: 1050.01 + │ │ ├── cost: 1100.01 │ │ └── prune: (1,2) │ └── filters │ └── (a + (b * 2)) > 1 [outer=(1,2)] @@ -737,24 +737,24 @@ EXPLAIN (OPT, TYPES) SELECT * FROM tc WHERE a + 2 * b > 1 ORDER BY a*b sort ├── columns: a:1(int) b:2(int) [hidden: column4:4(int)] ├── stats: [rows=333.333333] - ├── cost: 1129.24548 + ├── cost: 1179.24548 ├── fd: (1,2)-->(4) ├── ordering: +4 ├── prune: (1,2,4) └── project ├── columns: column4:4(int) a:1(int) b:2(int) ├── stats: [rows=333.333333] - ├── cost: 1066.69667 + ├── cost: 1116.69667 ├── fd: (1,2)-->(4) ├── prune: (1,2,4) ├── select │ ├── columns: a:1(int) b:2(int) │ ├── stats: [rows=333.333333] - │ ├── cost: 1060.02 + │ ├── cost: 1110.02 │ ├── scan tc │ │ ├── columns: a:1(int) b:2(int) │ │ ├── stats: [rows=1000] - │ │ ├── cost: 1050.01 + │ │ ├── cost: 1100.01 │ │ └── prune: (1,2) │ └── filters │ └── gt [type=bool, outer=(1,2)] diff --git a/pkg/sql/opt/testutils/opttester/opt_tester.go b/pkg/sql/opt/testutils/opttester/opt_tester.go index 33c93a759855..e6a56a1902e5 100644 --- a/pkg/sql/opt/testutils/opttester/opt_tester.go +++ b/pkg/sql/opt/testutils/opttester/opt_tester.go @@ -24,6 +24,7 @@ import ( "testing" "text/tabwriter" + "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/opt" "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" @@ -122,6 +123,15 @@ type Flags struct { // ReorderJoinsLimit is the maximum number of joins in a query which the optimizer // should attempt to reorder. JoinLimit int + + // Locality specifies the location of the planning node as a set of user- + // defined key/value pairs, ordered from most inclusive to least inclusive. + // If there are no tiers, then the node's location is not known. Examples: + // + // [region=eu] + // [region=us,dc=east] + // + Locality roachpb.Locality } // New constructs a new instance of the OptTester for the given SQL statement. @@ -223,6 +233,10 @@ func New(catalog cat.Catalog, sql string) *OptTester { // expression in the query tree for the purpose of creating alternate query // plans in the optimizer. // +// - locality: used to set the locality of the node that plans the query. This +// can affect costing when there are multiple possible indexes to choose +// from, each in different localities. +// func (ot *OptTester) RunCommand(tb testing.TB, d *datadriven.TestData) string { // Allow testcases to override the flags. for _, a := range d.CmdArgs { @@ -240,6 +254,7 @@ func (ot *OptTester) RunCommand(tb testing.TB, d *datadriven.TestData) string { ot.Flags.Verbose = testing.Verbose() ot.evalCtx.TestingKnobs.OptimizerCostPerturbation = ot.Flags.PerturbCost + ot.evalCtx.Locality = ot.Flags.Locality switch d.Cmd { case "exec-ddl": @@ -509,6 +524,14 @@ func (f *Flags) Set(arg datadriven.CmdArg) error { return err } + case "locality": + // Recombine multiple arguments, separated by commas. + locality := strings.Join(arg.Vals, ",") + err := f.Locality.Set(locality) + if err != nil { + return err + } + default: return fmt.Errorf("unknown argument: %s", arg.Key) } diff --git a/pkg/sql/opt/testutils/testcat/create_table.go b/pkg/sql/opt/testutils/testcat/create_table.go index f0610b54d0e8..efbc6d65b172 100644 --- a/pkg/sql/opt/testutils/testcat/create_table.go +++ b/pkg/sql/opt/testutils/testcat/create_table.go @@ -18,6 +18,7 @@ import ( "fmt" "strings" + "github.com/cockroachdb/cockroach/pkg/config" "github.com/cockroachdb/cockroach/pkg/sql/coltypes" "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" @@ -330,6 +331,7 @@ func (tt *Table) addIndex(def *tree.IndexTableDef, typ indexType) *Index { IdxName: tt.makeIndexName(def.Name, typ), Unique: typ != nonUniqueIndex, Inverted: def.Inverted, + IdxZone: &config.ZoneConfig{}, table: tt, } diff --git a/pkg/sql/opt/testutils/testcat/set_zone_config.go b/pkg/sql/opt/testutils/testcat/set_zone_config.go new file mode 100644 index 000000000000..acc78825272d --- /dev/null +++ b/pkg/sql/opt/testutils/testcat/set_zone_config.go @@ -0,0 +1,56 @@ +// Copyright 2019 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package testcat + +import ( + "fmt" + + "github.com/cockroachdb/cockroach/pkg/config" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "gopkg.in/yaml.v2" +) + +// SetZoneConfig is a partial implementation of the ALTER TABLE ... CONFIGURE +// ZONE USING statement. +func (tc *Catalog) SetZoneConfig(stmt *tree.SetZoneConfig) *config.ZoneConfig { + // Update the table name to include catalog and schema if not provided. + tabName := stmt.TableOrIndex.Table + tc.qualifyTableName(&tabName) + tab := tc.Table(&tabName) + + for _, idx := range tab.Indexes { + if idx.IdxName == string(stmt.TableOrIndex.Index) { + idx.IdxZone = makeZoneConfig(stmt.Options) + return idx.IdxZone + } + } + panic(fmt.Errorf("\"%q\" is not an index", stmt.TableOrIndex.Index)) +} + +// makeZoneConfig constructs a ZoneConfig from options provided to the CONFIGURE +// ZONE USING statement. +func makeZoneConfig(options tree.KVOptions) *config.ZoneConfig { + for i := range options { + if options[i].Key == "constraints" { + constraintsList := &config.ConstraintsList{} + value := options[i].Value.(*tree.StrVal).RawString() + if err := yaml.UnmarshalStrict([]byte(value), constraintsList); err != nil { + panic(err) + } + return &config.ZoneConfig{Constraints: constraintsList.Constraints} + } + } + return &config.ZoneConfig{} +} diff --git a/pkg/sql/opt/testutils/testcat/test_catalog.go b/pkg/sql/opt/testutils/testcat/test_catalog.go index 0dd3f3b29eb6..55eb96d5b83c 100644 --- a/pkg/sql/opt/testutils/testcat/test_catalog.go +++ b/pkg/sql/opt/testutils/testcat/test_catalog.go @@ -19,6 +19,7 @@ import ( "fmt" "time" + "github.com/cockroachdb/cockroach/pkg/config" "github.com/cockroachdb/cockroach/pkg/sql/coltypes" "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" "github.com/cockroachdb/cockroach/pkg/sql/parser" @@ -277,8 +278,10 @@ func (tc *Catalog) ExecuteDDL(sql string) (string, error) { return "", err } - if stmt.AST.StatementType() != tree.DDL { - return "", fmt.Errorf("statement type is not DDL: %v", stmt.AST.StatementType()) + switch stmt.AST.StatementType() { + case tree.DDL, tree.RowsAffected: + default: + return "", fmt.Errorf("statement type is not DDL or RowsAffected: %v", stmt.AST.StatementType()) } switch stmt := stmt.AST.(type) { @@ -302,6 +305,12 @@ func (tc *Catalog) ExecuteDDL(sql string) (string, error) { seq := tc.CreateSequence(stmt) return seq.String(), nil + case *tree.SetZoneConfig: + zone := tc.SetZoneConfig(stmt) + tp := treeprinter.New() + cat.FormatZone(zone, tp) + return tp.String(), nil + default: return "", fmt.Errorf("unsupported statement: %v", stmt) } @@ -391,7 +400,7 @@ var _ cat.View = &View{} func (tv *View) String() string { tp := treeprinter.New() - cat.FormatCatalogView(tv, tp) + cat.FormatView(tv, tp) return tp.String() } @@ -463,7 +472,7 @@ var _ cat.Table = &Table{} func (tt *Table) String() string { tp := treeprinter.New() - cat.FormatCatalogTable(tt.Catalog, tt, tp) + cat.FormatTable(tt.Catalog, tt, tp) return tp.String() } @@ -609,6 +618,10 @@ type Index struct { Columns []cat.IndexColumn + // IdxZone is the zone associated with the index. This may be inherited from + // the parent table, database, or even the default zone. + IdxZone *config.ZoneConfig + // table is a back reference to the table this index is on. table *Table @@ -669,6 +682,11 @@ func (ti *Index) ForeignKey() (cat.ForeignKeyReference, bool) { return ti.foreignKey, ti.fkSet } +// Zone is part of the cat.Index interface. +func (ti *Index) Zone() cat.Zone { + return ti.IdxZone +} + // Column implements the cat.Column interface for testing purposes. type Column struct { Ordinal int @@ -834,7 +852,7 @@ func (ts *Sequence) SequenceName() *tree.TableName { func (ts *Sequence) String() string { tp := treeprinter.New() - cat.FormatCatalogSequence(ts.Catalog, ts, tp) + cat.FormatSequence(ts.Catalog, ts, tp) return tp.String() } diff --git a/pkg/sql/opt/xform/coster.go b/pkg/sql/opt/xform/coster.go index 3e34966baf19..95a316922567 100644 --- a/pkg/sql/opt/xform/coster.go +++ b/pkg/sql/opt/xform/coster.go @@ -19,11 +19,13 @@ import ( "math" "math/rand" + "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/sql/opt" "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" "github.com/cockroachdb/cockroach/pkg/sql/opt/memo" "github.com/cockroachdb/cockroach/pkg/sql/opt/ordering" "github.com/cockroachdb/cockroach/pkg/sql/opt/props/physical" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" ) // Coster is used by the optimizer to assign a cost to a candidate expression @@ -31,6 +33,21 @@ import ( // expression has a lower cost than any other expression in the memo group, then // it becomes the new best expression for the group. // +// The set of costing formulas maintained by the coster for the set of all +// operators constitute the "cost model". A given cost model can be designed to +// maximize any optimization goal, such as: +// +// 1. Max aggregate cluster throughput (txns/sec across cluster) +// 2. Min transaction latency (time to commit txns) +// 3. Min latency to first row (time to get first row of txns) +// 4. Min memory usage +// 5. Some weighted combination of #1 - #4 +// +// The cost model in this file targets #1 as the optimization goal. However, +// note that #2 is implicitly important to that goal, since overall cluster +// throughput will suffer if there are lots of pending transactions waiting on +// I/O. +// // Coster is an interface so that different costing algorithms can be used by // the optimizer. For example, the OptSteps command uses a custom coster that // assigns infinite costs to some expressions in order to prevent them from @@ -52,6 +69,14 @@ type Coster interface { type coster struct { mem *memo.Memo + // locality gives the location of the current node as a set of user-defined + // key/value pairs, ordered from most inclusive to least inclusive. If there + // are no tiers, then the node's location is not known. Example: + // + // [region=us,dc=east] + // + locality roachpb.Locality + // perturbation indicates how much to randomly perturb the cost. It is used // to generate alternative plans for testing. For example, if perturbation is // 0.5, and the estimated cost of an expression is c, the cost returned by @@ -77,6 +102,18 @@ const ( seqIOCostFactor = 1 randIOCostFactor = 4 + // latencyCostFactor represents the throughput impact of doing scans on an + // index that may be remotely located in a different locality. If latencies + // are higher, then overall cluster throughput will suffer somewhat, as there + // will be more queries in memory blocking on I/O. The impact on throughput + // is expected to be relatively low, so latencyCostFactor is set to a small + // value. However, even a low value will cause the optimizer to prefer + // indexes that are likely to be geographically closer, if they are otherwise + // the same cost to access. + // TODO(andyk): Need to do analysis to figure out right value and/or to come + // up with better way to incorporate latency into the coster. + latencyCostFactor = cpuCostFactor + // hugeCost is used with expressions we want to avoid; these are expressions // that "violate" a hint like forcing a specific index or join algorithm. // If the final expression has this cost or larger, it means that there was no @@ -85,8 +122,9 @@ const ( ) // Init initializes a new coster structure with the given memo. -func (c *coster) Init(mem *memo.Memo, perturbation float64) { +func (c *coster) Init(evalCtx *tree.EvalContext, mem *memo.Memo, perturbation float64) { c.mem = mem + c.locality = evalCtx.Locality c.perturbation = perturbation } @@ -446,16 +484,94 @@ func (c *coster) rowSortCost(numKeyCols int) memo.Cost { return memo.Cost(cost) } +// localityMatchScore returns a number from 0.0 to 1.0 that describes how well +// the current node's locality matches the given zone constraints, with 0.0 +// indicating 0% and 1.0 indicating 100%. In order to match, each successive +// locality tier must match at least one REQUIRED constraint and not match any +// PROHIBITED constraints. Locality tiers are hierarchical, so if a locality +// tier does not match, then tiers after it do not match either. For example: +// +// Locality = [region=us,dc=east] +// 0.0 = [] +// 0.0 = [+region=eu,+dc=uk] +// 0.0 = [-region=us] +// 0.0 = [+region=eu,+dc=east] +// 0.5 = [+region=us,+dc=west] +// 0.5 = [+region=us,-dc=east] +// 1.0 = [+region=us,+dc=east] +// 1.0 = [+region=us,+dc=east,+rack=1,-ssd] +// +// Note that constraints need not be specified in any particular order, so scan +// all constraints when matching each locality tier. +func (c *coster) localityMatchScore(zone cat.Zone) float64 { + // If there are no replica constraints, then locality can't match. + if zone.ReplicaConstraintsCount() == 0 { + return 0.0 + } + + // matchTier returns true if it can locate a required constraint that matches + // the given tier. + matchConstraints := func(zc cat.ReplicaConstraints, tier *roachpb.Tier) bool { + for i, n := 0, zc.ConstraintCount(); i < n; i++ { + con := zc.Constraint(i) + if tier.Key == con.GetKey() && tier.Value == con.GetValue() { + // If this is a required constraint, then it matches, and no need to + // iterate further. If it's prohibited, then it cannot match, so no + // need to go further. + return con.IsRequired() + } + } + return false + } + + // matchReplConstraints returns true if all replica constraints match the + // given tier. + matchReplConstraints := func(zone cat.Zone, tier *roachpb.Tier) bool { + for i, n := 0, zone.ReplicaConstraintsCount(); i < n; i++ { + replCon := zone.ReplicaConstraints(i) + if !matchConstraints(replCon, tier) { + return false + } + } + return true + } + + // Keep iterating until non-matching tier is found, or all tiers are found to + // match. + matchCount := 0 + for i := range c.locality.Tiers { + if !matchReplConstraints(zone, &c.locality.Tiers[i]) { + break + } + matchCount++ + } + + return float64(matchCount) / float64(len(c.locality.Tiers)) +} + // rowScanCost is the CPU cost to scan one row, which depends on the number of // columns in the index and (to a lesser extent) on the number of columns we are // scanning. -func (c *coster) rowScanCost(table opt.TableID, index int, numScannedCols int) memo.Cost { +func (c *coster) rowScanCost(tabID opt.TableID, idxOrd int, numScannedCols int) memo.Cost { md := c.mem.Metadata() - numCols := md.Table(table).Index(index).ColumnCount() + tab := md.Table(tabID) + idx := tab.Index(idxOrd) + numCols := idx.ColumnCount() + + // Adjust cost based on how well the current locality matches the index's + // zone constraints. + var costFactor memo.Cost = cpuCostFactor + if len(c.locality.Tiers) != 0 { + // If 0% of locality tiers have matching constraints, then add additional + // cost. If 100% of locality tiers have matching constraints, then add no + // additional cost. Anything in between is proportional to the number of + // matches. + costFactor += latencyCostFactor * memo.Cost(1.0-c.localityMatchScore(idx.Zone())) + } // The number of the columns in the index matter because more columns means // more data to scan. The number of columns we actually return also matters // because that is the amount of data that we could potentially transfer over // the network. - return memo.Cost(numCols+numScannedCols) * cpuCostFactor + return memo.Cost(numCols+numScannedCols) * costFactor } diff --git a/pkg/sql/opt/xform/optimizer.go b/pkg/sql/opt/xform/optimizer.go index 9da76819cfca..b0d684644cd7 100644 --- a/pkg/sql/opt/xform/optimizer.go +++ b/pkg/sql/opt/xform/optimizer.go @@ -101,7 +101,7 @@ func (o *Optimizer) Init(evalCtx *tree.EvalContext) { o.f.Init(evalCtx) o.mem = o.f.Memo() o.explorer.init(o) - o.defaultCoster.Init(o.mem, evalCtx.TestingKnobs.OptimizerCostPerturbation) + o.defaultCoster.Init(evalCtx, o.mem, evalCtx.TestingKnobs.OptimizerCostPerturbation) o.coster = &o.defaultCoster o.stateMap = make(map[groupStateKey]*groupState) o.matchedRule = nil @@ -879,7 +879,7 @@ func (o *Optimizer) FormatMemo(flags FmtFlags) string { // the real computed cost, not the perturbed cost. func (o *Optimizer) RecomputeCost() { var c coster - c.Init(o.mem, 0 /* perturbation */) + c.Init(o.evalCtx, o.mem, 0 /* perturbation */) root := o.mem.RootExpr() rootProps := o.mem.RootProps() diff --git a/pkg/sql/opt/xform/testdata/coster/zone b/pkg/sql/opt/xform/testdata/coster/zone new file mode 100644 index 000000000000..26432a72067a --- /dev/null +++ b/pkg/sql/opt/xform/testdata/coster/zone @@ -0,0 +1,379 @@ +exec-ddl +CREATE TABLE abc ( + a INT PRIMARY KEY, + b INT, + c STRING, + UNIQUE INDEX bc1 (b, c), + UNIQUE INDEX bc2 (b, c) +) +---- +TABLE abc + ├── a int not null + ├── b int + ├── c string + ├── INDEX primary + │ └── a int not null + ├── INDEX bc1 + │ ├── b int + │ ├── c string + │ └── a int not null (storing) + └── INDEX bc2 + ├── b int + ├── c string + └── a int not null (storing) + +exec-ddl +CREATE TABLE xy ( + x INT PRIMARY KEY, + y INT, + INDEX y1 (y), + INDEX y2 (y) +) +---- +TABLE xy + ├── x int not null + ├── y int + ├── INDEX primary + │ └── x int not null + ├── INDEX y1 + │ ├── y int + │ └── x int not null + └── INDEX y2 + ├── y int + └── x int not null + +# -------------------------------------------------- +# Single constraints. +# -------------------------------------------------- + +exec-ddl +ALTER INDEX abc@bc1 CONFIGURE ZONE USING constraints='[+region=east]' +---- +ZONE + └── constraints: [+region=east] + +exec-ddl +ALTER INDEX abc@bc2 CONFIGURE ZONE USING constraints='[+region=west]' +---- +ZONE + └── constraints: [+region=west] + +# With locality in east, use bc1 index. +opt format=show-all locality=(region=east) +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc1 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.405 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# With locality in west, use bc2 index. +opt format=show-all locality=(region=west) +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc2 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.405 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# No locality, so use bc1, since it's first. +opt format=show-all +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc1 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.405 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# Locality doesn't match any constraints, so use bc1, since it's first. +opt format=show-all locality=(region=central) +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc1 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.9 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# -------------------------------------------------- +# Multiple constraints. +# -------------------------------------------------- + +exec-ddl +ALTER INDEX abc@bc1 CONFIGURE ZONE USING constraints='[+region=us,+dc=east,+rack=1]' +---- +ZONE + └── constraints: [+region=us,+dc=east,+rack=1] + +exec-ddl +ALTER INDEX abc@bc2 CONFIGURE ZONE USING constraints='[+region=us,+dc=west,+rack=1]' +---- +ZONE + └── constraints: [+region=us,+dc=west,+rack=1] + +# With locality in us + east, use bc1 index. +opt format=show-all locality=(region=us,dc=east) +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc1 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.405 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# With locality in us + west, use bc2 index. +opt format=show-all locality=(region=us,dc=west) +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc2 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.405 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# Ignore "dc=west,rack=1" match if "region" does not match. +opt format=show-all locality=(region=eu,dc=west,rack=1) +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc1 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.9 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# -------------------------------------------------- +# Multiple replica constraints. +# -------------------------------------------------- + +exec-ddl +ALTER INDEX abc@bc1 CONFIGURE ZONE USING constraints='{"+region=us,+dc=east":2, "+region=us,+dc=west":1}' +---- +ZONE + └── replica constraints + ├── 2 replicas: [+region=us,+dc=east] + └── 1 replicas: [+region=us,+dc=west] + +exec-ddl +ALTER INDEX abc@bc2 CONFIGURE ZONE USING constraints='[+region=us,+dc=east]' +---- +ZONE + └── constraints: [+region=us,+dc=east] + +# With locality in us, use bc1 index, since only one tier matches in case of +# both indexes. +opt format=show-all locality=(region=us) +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc1 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.405 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# With locality in us + east, use bc2 index (use lowest match count when +# replicas have different numbers of matches). +opt format=show-all locality=(region=us,dc=east) +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc2 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.405 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# -------------------------------------------------- +# Complex constraints. +# -------------------------------------------------- + +exec-ddl +ALTER INDEX abc@bc1 CONFIGURE ZONE USING constraints='[+region=us,-region=eu,+region=ap]' +---- +ZONE + └── constraints: [+region=us,-region=eu,+region=ap] + +exec-ddl +ALTER INDEX abc@bc2 CONFIGURE ZONE USING constraints='[+region=eu,+region=us,+dc=east]' +---- +ZONE + └── constraints: [+region=eu,+region=us,+dc=east] + +# With locality in us, use bc1, since it's first in order. +opt format=show-all locality=(region=us) +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc1 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.405 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# With locality in eu, use bc2, since it's prohibited with bc1. +opt format=show-all locality=(region=eu) +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc2 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.405 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# With locality in us + east, use bc2, since it matches both tiers, even though +# "us" match is after "eu" in list. +opt format=show-all locality=(region=us,dc=east) +SELECT b, c FROM abc where b=10 +---- +scan t.public.abc@bc2 + ├── columns: b:2(int!null) c:3(string) + ├── constraint: /2/3: [/10 - /10] + ├── stats: [rows=9.9, distinct(2)=1, null(2)=0] + ├── cost: 10.405 + ├── lax-key: (3) + ├── fd: ()-->(2) + ├── prune: (3) + └── interesting orderings: (+2,+3) + +# -------------------------------------------------- +# Lookup join. +# -------------------------------------------------- + +exec-ddl +ALTER INDEX abc@bc1 CONFIGURE ZONE USING constraints='[+region=us,+dc=east]' +---- +ZONE + └── constraints: [+region=us,+dc=east] + +exec-ddl +ALTER INDEX abc@bc2 CONFIGURE ZONE USING constraints='[+region=us,+dc=west]' +---- +ZONE + └── constraints: [+region=us,+dc=west] + +exec-ddl +ALTER INDEX xy@y1 CONFIGURE ZONE USING constraints='[+region=us,+dc=east]' +---- +ZONE + └── constraints: [+region=us,+dc=east] + +exec-ddl +ALTER INDEX xy@y2 CONFIGURE ZONE USING constraints='[+region=us,+dc=west]' +---- +ZONE + └── constraints: [+region=us,+dc=west] + +# Ensure that both indexes involved in the lookup join are selected from the +# "west" data center. +opt format=show-all locality=(region=us,dc=west) +SELECT * FROM abc INNER LOOKUP JOIN xy ON b=y WHERE b=1 +---- +inner-join (lookup xy@y2) + ├── columns: a:1(int!null) b:2(int!null) c:3(string) x:4(int!null) y:5(int!null) + ├── flags: no-merge-join;no-hash-join + ├── key columns: [2] = [5] + ├── stats: [rows=98.01, distinct(1)=9.9, null(1)=0, distinct(2)=1, null(2)=0, distinct(4)=9.9, null(4)=0, distinct(5)=1, null(5)=0] + ├── cost: 152.0444 + ├── key: (1,4) + ├── fd: ()-->(2,5), (1)-->(3), (2,3)~~>(1), (2)==(5), (5)==(2) + ├── prune: (1,3,4) + ├── interesting orderings: (+1) (+2,+3,+1) + ├── scan t.public.abc@bc2 + │ ├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(string) + │ ├── constraint: /2/3: [/1 - /1] + │ ├── stats: [rows=9.9, distinct(1)=9.9, null(1)=0, distinct(2)=1, null(2)=0] + │ ├── cost: 10.504 + │ ├── key: (1) + │ ├── fd: ()-->(2), (1)-->(3), (2,3)~~>(1) + │ ├── prune: (1,3) + │ └── interesting orderings: (+1) (+2,+3,+1) + └── filters + └── eq [type=bool, outer=(5), constraints=(/5: [/1 - /1]; tight), fd=()-->(5)] + ├── variable: t.public.xy.y [type=int] + └── const: 1 [type=int] + +# Switch the data center for the target lookup join index. + +exec-ddl +ALTER INDEX xy@y1 CONFIGURE ZONE USING constraints='[+region=us,+dc=west]' +---- +ZONE + └── constraints: [+region=us,+dc=west] + +exec-ddl +ALTER INDEX xy@y2 CONFIGURE ZONE USING constraints='[+region=us,+dc=east]' +---- +ZONE + └── constraints: [+region=us,+dc=east] + +# Should use other index now. +opt format=show-all locality=(region=us,dc=west) +SELECT * FROM abc INNER LOOKUP JOIN xy ON b=y WHERE b=1 +---- +inner-join (lookup xy@y1) + ├── columns: a:1(int!null) b:2(int!null) c:3(string) x:4(int!null) y:5(int!null) + ├── flags: no-merge-join;no-hash-join + ├── key columns: [2] = [5] + ├── stats: [rows=98.01, distinct(1)=9.9, null(1)=0, distinct(2)=1, null(2)=0, distinct(4)=9.9, null(4)=0, distinct(5)=1, null(5)=0] + ├── cost: 152.0444 + ├── key: (1,4) + ├── fd: ()-->(2,5), (1)-->(3), (2,3)~~>(1), (2)==(5), (5)==(2) + ├── prune: (1,3,4) + ├── interesting orderings: (+1) (+2,+3,+1) + ├── scan t.public.abc@bc2 + │ ├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(string) + │ ├── constraint: /2/3: [/1 - /1] + │ ├── stats: [rows=9.9, distinct(1)=9.9, null(1)=0, distinct(2)=1, null(2)=0] + │ ├── cost: 10.504 + │ ├── key: (1) + │ ├── fd: ()-->(2), (1)-->(3), (2,3)~~>(1) + │ ├── prune: (1,3) + │ └── interesting orderings: (+1) (+2,+3,+1) + └── filters + └── eq [type=bool, outer=(5), constraints=(/5: [/1 - /1]; tight), fd=()-->(5)] + ├── variable: t.public.xy.y [type=int] + └── const: 1 [type=int] diff --git a/pkg/sql/opt_catalog.go b/pkg/sql/opt_catalog.go index 7afd99a72cad..3c2759e619d5 100644 --- a/pkg/sql/opt_catalog.go +++ b/pkg/sql/opt_catalog.go @@ -19,6 +19,7 @@ import ( "math" "time" + "github.com/cockroachdb/cockroach/pkg/config" "github.com/cockroachdb/cockroach/pkg/sql/opt/cat" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/sql/privilege" @@ -33,10 +34,12 @@ import ( // only include what the optimizer needs, and certain common lookups are cached // for faster performance. type optCatalog struct { - // resolver needs to be set via a call to init before calling other methods. - resolver LogicalSchema + // planner needs to be set via a call to init before calling other methods. + planner *planner - statsCache *stats.TableStatisticsCache + // cfg is the gossiped and cached system config. It may be nil if the node + // does not yet have it available. + cfg *config.SystemConfig // dataSources is a cache of table and view objects that's used to satisfy // repeated calls for the same data source. The same underlying descriptor @@ -50,10 +53,14 @@ type optCatalog struct { var _ cat.Catalog = &optCatalog{} // init allows the caller to pre-allocate optCatalog. -func (oc *optCatalog) init(statsCache *stats.TableStatisticsCache, resolver LogicalSchema) { - oc.resolver = resolver - oc.statsCache = statsCache +func (oc *optCatalog) init(planner *planner) { + oc.planner = planner oc.dataSources = nil + + // Gossip can be nil in testing scenarios. + if planner.execCfg.Gossip != nil { + oc.cfg = planner.execCfg.Gossip.GetSystemConfig() + } } // optSchema is a wrapper around sqlbase.DatabaseDescriptor that implements the @@ -84,9 +91,10 @@ func (os *optSchema) Name() *cat.SchemaName { func (oc *optCatalog) ResolveSchema( ctx context.Context, name *cat.SchemaName, ) (cat.Schema, cat.SchemaName, error) { - p := oc.resolver.(*planner) - defer func(prev bool) { p.avoidCachedDescriptors = prev }(p.avoidCachedDescriptors) - p.avoidCachedDescriptors = true + defer func(prev bool) { + oc.planner.avoidCachedDescriptors = prev + }(oc.planner.avoidCachedDescriptors) + oc.planner.avoidCachedDescriptors = true // ResolveTargetObject wraps ResolveTarget in order to raise "schema not // found" and "schema cannot be modified" errors. However, ResolveTargetObject @@ -97,9 +105,9 @@ func (oc *optCatalog) ResolveSchema( oc.tn.TableNamePrefix = *name found, desc, err := oc.tn.ResolveTarget( ctx, - oc.resolver, - oc.resolver.CurrentDatabase(), - oc.resolver.CurrentSearchPath(), + oc.planner, + oc.planner.CurrentDatabase(), + oc.planner.CurrentSearchPath(), ) if err != nil { return nil, cat.SchemaName{}, err @@ -116,7 +124,7 @@ func (oc *optCatalog) ResolveDataSource( ctx context.Context, name *cat.DataSourceName, ) (cat.DataSource, cat.DataSourceName, error) { oc.tn = *name - desc, err := ResolveExistingObject(ctx, oc.resolver, &oc.tn, true /* required */, anyDescType) + desc, err := ResolveExistingObject(ctx, oc.planner, &oc.tn, true /* required */, anyDescType) if err != nil { return nil, cat.DataSourceName{}, err } @@ -131,7 +139,7 @@ func (oc *optCatalog) ResolveDataSource( func (oc *optCatalog) ResolveDataSourceByID( ctx context.Context, dataSourceID cat.StableID, ) (cat.DataSource, error) { - tableLookup, err := oc.resolver.LookupTableByID(ctx, sqlbase.ID(dataSourceID)) + tableLookup, err := oc.planner.LookupTableByID(ctx, sqlbase.ID(dataSourceID)) if err != nil || tableLookup.IsAdding { if err == sqlbase.ErrDescriptorNotFound || tableLookup.IsAdding { @@ -141,7 +149,7 @@ func (oc *optCatalog) ResolveDataSourceByID( } desc := tableLookup.Desc - dbDesc, err := sqlbase.GetDatabaseDescFromID(ctx, oc.resolver.Txn(), desc.ParentID) + dbDesc, err := sqlbase.GetDatabaseDescFromID(ctx, oc.planner.Txn(), desc.ParentID) if err != nil { return nil, err } @@ -154,13 +162,13 @@ func (oc *optCatalog) ResolveDataSourceByID( func (oc *optCatalog) CheckPrivilege(ctx context.Context, o cat.Object, priv privilege.Kind) error { switch t := o.(type) { case *optSchema: - return oc.resolver.CheckPrivilege(ctx, t.desc, priv) + return oc.planner.CheckPrivilege(ctx, t.desc, priv) case *optTable: - return oc.resolver.CheckPrivilege(ctx, t.desc, priv) + return oc.planner.CheckPrivilege(ctx, t.desc, priv) case *optView: - return oc.resolver.CheckPrivilege(ctx, t.desc, priv) + return oc.planner.CheckPrivilege(ctx, t.desc, priv) case *optSequence: - return oc.resolver.CheckPrivilege(ctx, t.desc, priv) + return oc.planner.CheckPrivilege(ctx, t.desc, priv) default: return pgerror.NewAssertionErrorf("invalid object type: %T", o) } @@ -199,7 +207,7 @@ func (oc *optCatalog) newDataSource( // all databases. We treat the empty catalog as having database ID 0. if name.Catalog() != "" { // TODO(radu): it's unfortunate that we have to lookup the schema again. - _, dbDesc, err := oc.resolver.LookupSchema(ctx, name.Catalog(), name.Schema()) + _, dbDesc, err := oc.planner.LookupSchema(ctx, name.Catalog(), name.Schema()) if err != nil { return nil, err } @@ -221,14 +229,17 @@ func (oc *optCatalog) newDataSource( } } - stats, err := oc.statsCache.GetTableStats(context.TODO(), desc.ID) + stats, err := oc.planner.execCfg.TableStatsCache.GetTableStats(context.TODO(), desc.ID) if err != nil { // Ignore any error. We still want to be able to run queries even if we lose // access to the statistics table. // TODO(radu): at least log the error. stats = nil } - ds = newOptTable(desc, id, name, stats) + ds, err = newOptTable(oc.cfg, desc, id, name, stats) + if err != nil { + return nil, err + } case desc.IsView(): ds = newOptView(desc, name) @@ -389,11 +400,12 @@ type optTable struct { var _ cat.Table = &optTable{} func newOptTable( + cfg *config.SystemConfig, desc *sqlbase.ImmutableTableDescriptor, id cat.StableID, name *cat.DataSourceName, stats []*stats.TableStatistic, -) *optTable { +) (*optTable, error) { ot := &optTable{desc: desc, id: id, name: *name} // The cat.Table interface requires that table names be fully qualified. @@ -407,6 +419,20 @@ func newOptTable( } if !ot.desc.IsVirtualTable() { + // Lookup table's zone if system config is available (it may not be as node + // is starting up and before it's received the gossiped config). If it is + // not available, use an empty config that has no zone constraints. + var tblZone *config.ZoneConfig + if cfg != nil { + var err error + tblZone, err = lookupZone(cfg, uint32(id)) + if err != nil { + return nil, err + } + } else { + tblZone = &config.ZoneConfig{} + } + // Build the indexes (add 1 to account for lack of primary index in // DeletableIndexes slice). ot.indexes = make([]optIndex, 1+len(ot.desc.DeletableIndexes())) @@ -418,7 +444,19 @@ func newOptTable( } else { idxDesc = &ot.desc.DeletableIndexes()[i-1] } - ot.indexes[i].init(ot, idxDesc) + + // If there is a subzone that applies to the entire index, use that, + // else use the table zone. Skip subzones that apply to partitions, + // since they apply only to a subset of the index. + idxZone := tblZone + for j := range tblZone.Subzones { + subzone := &tblZone.Subzones[j] + if subzone.IndexID == uint32(idxDesc.ID) && subzone.PartitionName == "" { + idxZone = &subzone.Config + } + } + + ot.indexes[i].init(ot, idxDesc, idxZone) } } @@ -452,7 +490,7 @@ func newOptTable( ot.stats = ot.stats[:n] } - return ot + return ot, nil } // ID is part of the cat.Object interface. @@ -469,6 +507,7 @@ func (ot *optTable) Equals(other cat.Object) bool { if ot.id != otherTable.id || ot.desc.Version != otherTable.desc.Version { return false } + // Verify the stats are identical. if len(ot.stats) != len(otherTable.stats) { return false @@ -478,6 +517,23 @@ func (ot *optTable) Equals(other cat.Object) bool { return false } } + + // Verify that indexes are in same zones. For performance, skip deep equality + // check if it's the same as the previous index (common case). + var prevLeftZone, prevRightZone *config.ZoneConfig + for i := range ot.indexes { + leftZone := ot.indexes[i].zone + rightZone := otherTable.indexes[i].zone + if leftZone == prevLeftZone && rightZone == prevRightZone { + continue + } + if !zonesAreEqual(leftZone, rightZone) { + return false + } + prevLeftZone = leftZone + prevRightZone = rightZone + } + return true } @@ -608,6 +664,8 @@ func (ot *optTable) lookupColumnOrdinal(colID sqlbase.ColumnID) (int, error) { type optIndex struct { tab *optTable desc *sqlbase.IndexDescriptor + zone *config.ZoneConfig + // storedCols is the set of non-PK columns if this is the primary index, // otherwise it is desc.StoreColumnIDs. storedCols []sqlbase.ColumnID @@ -625,9 +683,10 @@ var _ cat.Index = &optIndex{} // init can be used instead of newOptIndex when we have a pre-allocated instance // (e.g. as part of a bigger struct). -func (oi *optIndex) init(tab *optTable, desc *sqlbase.IndexDescriptor) { +func (oi *optIndex) init(tab *optTable, desc *sqlbase.IndexDescriptor, zone *config.ZoneConfig) { oi.tab = tab oi.desc = desc + oi.zone = zone if desc == &tab.desc.PrimaryIndex { // Although the primary index contains all columns in the table, the index // descriptor does not contain columns that are not explicitly part of the @@ -757,6 +816,11 @@ func (oi *optIndex) ForeignKey() (cat.ForeignKeyReference, bool) { return oi.foreignKey, oi.desc.ForeignKey.IsSet() } +// Zone is part of the cat.Index interface. +func (oi *optIndex) Zone() cat.Zone { + return oi.zone +} + // Table is part of the cat.Index interface. func (oi *optIndex) Table() cat.Table { return oi.tab @@ -875,3 +939,48 @@ func (oi *optFamily) Column(i int) cat.FamilyColumn { func (oi *optFamily) Table() cat.Table { return oi.tab } + +// lookupZone returns the ZoneConfig data structure for the given schema object +// ID. ZoneConfigs are stored in protobuf binary format in the SystemConfig, +// which is gossiped around the cluster. Note that the returned ZoneConfig might +// be somewhat stale, since it's taken from the gossiped SystemConfig. +func lookupZone(cfg *config.SystemConfig, id uint32) (*config.ZoneConfig, error) { + zone, _, _, err := ZoneConfigHook(cfg, id) + if err != nil { + return nil, err + } + return zone, nil +} + +// zonesAreEqual compares two zones for equality. Note that only fields actually +// exposed by the cat.Zone interface and needed by the optimizer are compared. +func zonesAreEqual(left, right *config.ZoneConfig) bool { + if len(left.Constraints) != len(right.Constraints) { + return false + } + for i := range left.Constraints { + leftReplCons := &left.Constraints[i] + rightReplCons := &right.Constraints[i] + if leftReplCons.NumReplicas != rightReplCons.NumReplicas { + return false + } + if len(leftReplCons.Constraints) != len(rightReplCons.Constraints) { + return false + } + + for j := range leftReplCons.Constraints { + leftCons := &leftReplCons.Constraints[j] + rightCons := &rightReplCons.Constraints[j] + if leftCons.Type != rightCons.Type { + return false + } + if leftCons.Key != rightCons.Key { + return false + } + if leftCons.Value != rightCons.Value { + return false + } + } + } + return true +} diff --git a/pkg/sql/plan_opt.go b/pkg/sql/plan_opt.go index cfed84dab505..259c6d975bb2 100644 --- a/pkg/sql/plan_opt.go +++ b/pkg/sql/plan_opt.go @@ -208,7 +208,7 @@ type optPlanningCtx struct { func (opc *optPlanningCtx) init(p *planner, AST tree.Statement) { opc.p = p - opc.catalog.init(p.execCfg.TableStatsCache, p) + opc.catalog.init(p) p.optimizer.Init(p.EvalContext()) opc.flags = planFlagOptUsed diff --git a/pkg/sql/planner.go b/pkg/sql/planner.go index b49850cc6d22..9829d6c1ae05 100644 --- a/pkg/sql/planner.go +++ b/pkg/sql/planner.go @@ -261,6 +261,7 @@ func newInternalPlanner( p.extendedEvalCtx.Sequence = p p.extendedEvalCtx.ClusterID = execCfg.ClusterID() p.extendedEvalCtx.NodeID = execCfg.NodeID.Get() + p.extendedEvalCtx.Locality = execCfg.Locality p.sessionDataMutator = dataMutator p.autoCommit = false diff --git a/pkg/sql/sem/tree/eval.go b/pkg/sql/sem/tree/eval.go index f5ab159a111f..49ed501539df 100644 --- a/pkg/sql/sem/tree/eval.go +++ b/pkg/sql/sem/tree/eval.go @@ -2489,6 +2489,15 @@ type EvalContext struct { Settings *cluster.Settings ClusterID uuid.UUID NodeID roachpb.NodeID + + // Locality contains the location of the current node as a set of user-defined + // key/value pairs, ordered from most inclusive to least inclusive. If there + // are no tiers, then the node's location is not known. Example: + // + // [region=us,dc=east] + // + Locality roachpb.Locality + // The statement timestamp. May be different for every statement. // Used for statement_timestamp(). StmtTimestamp time.Time diff --git a/pkg/sql/zone_config.go b/pkg/sql/zone_config.go index 9fc9a1bebc0f..98d80678890e 100644 --- a/pkg/sql/zone_config.go +++ b/pkg/sql/zone_config.go @@ -106,12 +106,12 @@ func getZoneConfig( return 0, nil, 0, nil, errNoZoneConfigApplies } -// CompleteZoneConfig takes a zone config pointer and fills in the +// completeZoneConfig takes a zone config pointer and fills in the // missing fields by following the chain of inheritance. // In the worst case, will have to inherit from the default zone config. // NOTE: This will not work for subzones. To complete subzones, find a complete // parent zone (index or table) and apply InheritFromParent to it. -func CompleteZoneConfig( +func completeZoneConfig( cfg *config.ZoneConfig, id uint32, getKey func(roachpb.Key) (*roachpb.Value, error), ) error { if cfg.IsComplete() { @@ -163,7 +163,7 @@ func ZoneConfigHook( } else if err != nil { return nil, nil, false, err } - if err = CompleteZoneConfig(zone, zoneID, getKey); err != nil { + if err = completeZoneConfig(zone, zoneID, getKey); err != nil { return nil, nil, false, err } return zone, placeholder, true, nil @@ -191,7 +191,7 @@ func GetZoneConfigInTxn( if err != nil { return 0, nil, nil, err } - if err = CompleteZoneConfig(zone, zoneID, getKey); err != nil { + if err = completeZoneConfig(zone, zoneID, getKey); err != nil { return 0, nil, nil, err } var subzone *config.Subzone diff --git a/pkg/testutils/testcluster/testcluster.go b/pkg/testutils/testcluster/testcluster.go index 281fc6bb4461..d7d7a43831ba 100644 --- a/pkg/testutils/testcluster/testcluster.go +++ b/pkg/testutils/testcluster/testcluster.go @@ -151,6 +151,17 @@ func StartTestCluster(t testing.TB, nodes int, args base.TestClusterArgs) *TestC } else { serverArgs = args.ServerArgs } + + // If there are multiple nodes, place them in different localities by + // default. + if nodes > 0 { + tiers := []roachpb.Tier{ + {Key: "region", Value: "test"}, + {Key: "dc", Value: fmt.Sprintf("dc%d", i+1)}, + } + serverArgs.Locality = roachpb.Locality{Tiers: tiers} + } + if i > 0 { serverArgs.JoinAddr = tc.Servers[0].ServingAddr() }