Skip to content

Commit

Permalink
Merge #88396 #88444
Browse files Browse the repository at this point in the history
88396: opt: fix node-crashing panics with correlated With exprs r=mgartner a=mgartner

#### opt: prevent apply-join panics from crashing nodes

Previously, it was possible for the execution of an apply-join to crash
a node due to an uncaught optimizer panic when calling the
`planRightSideFn` closure. This closure is invoked for every input row
to the apply-join. It replaces variables in the expression on the right
side of the join with constants using `Factory.CopyAndReplace`, which
can panic. This panic won't be caught by the panic-catching logic in
`Optimizer.Optimize` because the closure is invoked outside the context
of `Optimizer.Optimize` - it's occurring during execution instead.

This commit copies the panic-catching logic of `Optimizer.Optimize` to
the apply-join's `planRightSideFn` closure to ensure that any panics are
caught.

Release Note (bug fix): A bug has been fixed that could cause nodes to
crash in rare cases when executing apply-joins in query plans.

#### opt: fix transitive references to With exprs in RHS of apply-join

This commit fixes an error that could occur when the RHS of an
apply-join referenced a With expression transitive through another With
expression. The error occurred because the optimizer could not access
the relational properties of the transitively referenced With expression
because the With was not added to the metadata. The commit fixes the
issue by adding all With expressions to the metadata if any With
expressions are referenced.

Fixes #87733

Release note (bug fix): A bug has been fixed that caused errors in rare
cases when executing queries with correlated WITH expressions. This bug
was present since correlated WITH expressions were introduced in
v21.2.0.


88444: roachtest: use highmem instances for some tests  r=tbg a=erikgrinaker

**roachtest: add `HighMem` option**

This patch adds a `HighMem` cluster spec option, which will use machine
types with increased memory. There are no changes to existing machine
types.

Release note: None
  
**roachtest: use highmem instances for some tests**

The following tests have been seen to OOM occasionally. Since a root
cause fix isn't planned for the near term, this serves to deflake the
tests.

* `tpccbench/nodes=9/cpu=4/multi-region`
* `restore2TB/*`

Resolves #87809.

Release note: None

Co-authored-by: Marcus Gartner <[email protected]>
Co-authored-by: Erik Grinaker <[email protected]>
  • Loading branch information
3 people committed Sep 22, 2022
3 parents f08a1b0 + 5718b5f + bbca64c commit 30ba7c3
Show file tree
Hide file tree
Showing 9 changed files with 136 additions and 27 deletions.
10 changes: 7 additions & 3 deletions pkg/cmd/roachtest/spec/cluster_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ type ClusterSpec struct {
NodeCount int
// CPUs is the number of CPUs per node.
CPUs int
HighMem bool
SSDs int
RAID0 bool
VolumeSize int
Expand Down Expand Up @@ -82,6 +83,9 @@ func ClustersCompatible(s1, s2 ClusterSpec) bool {
// String implements fmt.Stringer.
func (s ClusterSpec) String() string {
str := fmt.Sprintf("n%dcpu%d", s.NodeCount, s.CPUs)
if s.HighMem {
str += "m"
}
if s.Geo {
str += "-Geo"
}
Expand Down Expand Up @@ -191,11 +195,11 @@ func (s *ClusterSpec) RoachprodOpts(
// based on the cloud and CPU count.
switch s.Cloud {
case AWS:
machineType = AWSMachineType(s.CPUs)
machineType = AWSMachineType(s.CPUs, s.HighMem)
case GCE:
machineType = GCEMachineType(s.CPUs)
machineType = GCEMachineType(s.CPUs, s.HighMem)
case Azure:
machineType = AzureMachineType(s.CPUs)
machineType = AzureMachineType(s.CPUs, s.HighMem)
}
}

Expand Down
49 changes: 35 additions & 14 deletions pkg/cmd/roachtest/spec/machine_type.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,42 +13,63 @@ package spec
import "fmt"

// AWSMachineType selects a machine type given the desired number of CPUs.
func AWSMachineType(cpus int) string {
func AWSMachineType(cpus int, highmem bool) string {
// TODO(erikgrinaker): These have significantly less RAM than
// their GCE counterparts. Consider harmonizing them.
family := "c5d" // 2 GB RAM per CPU
if highmem {
family = "m5d" // 4 GB RAM per CPU
}

var size string
switch {
case cpus <= 2:
return "c5d.large"
size = "large"
case cpus <= 4:
return "c5d.xlarge"
size = "xlarge"
case cpus <= 8:
return "c5d.2xlarge"
size = "2xlarge"
case cpus <= 16:
return "c5d.4xlarge"
size = "4xlarge"
case cpus <= 36:
return "c5d.9xlarge"
size = "9xlarge"
case cpus <= 72:
return "c5d.18xlarge"
size = "18xlarge"
case cpus <= 96:
// There is no c5d.24xlarge.
return "m5d.24xlarge"
size = "24xlarge"
default:
panic(fmt.Sprintf("no aws machine type with %d cpus", cpus))
}

// There is no c5d.24xlarge.
if family == "c5d" && size == "24xlarge" {
family = "m5d"
}

return fmt.Sprintf("%s.%s", family, size)
}

// GCEMachineType selects a machine type given the desired number of CPUs.
func GCEMachineType(cpus int) string {
func GCEMachineType(cpus int, highmem bool) string {
// TODO(peter): This is awkward: at or below 16 cpus, use n1-standard so that
// the machines have a decent amount of RAM. We could use custom machine
// configurations, but the rules for the amount of RAM per CPU need to be
// determined (you can't request any arbitrary amount of RAM).
if cpus <= 16 {
return fmt.Sprintf("n1-standard-%d", cpus)
series := "n1"
kind := "standard" // 3.75 GB RAM per CPU
if highmem {
kind = "highmem" // 6.5 GB RAM per CPU
} else if cpus > 16 {
kind = "highcpu" // 0.9 GB RAM per CPU
}
return fmt.Sprintf("n1-highcpu-%d", cpus)
return fmt.Sprintf("%s-%s-%d", series, kind, cpus)
}

// AzureMachineType selects a machine type given the desired number of CPUs.
func AzureMachineType(cpus int) string {
func AzureMachineType(cpus int, highmem bool) string {
if highmem {
panic("highmem not implemented for Azure")
}
switch {
case cpus <= 2:
return "Standard_D2_v3"
Expand Down
11 changes: 11 additions & 0 deletions pkg/cmd/roachtest/spec/option.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,17 @@ func CPU(n int) Option {
return nodeCPUOption(n)
}

type nodeHighMemOption bool

func (o nodeHighMemOption) apply(spec *ClusterSpec) {
spec.HighMem = bool(o)
}

// HighMem requests nodes with additional memory per CPU.
func HighMem(enabled bool) Option {
return nodeHighMemOption(enabled)
}

type volumeSizeOption int

func (o volumeSizeOption) apply(spec *ClusterSpec) {
Expand Down
2 changes: 2 additions & 0 deletions pkg/cmd/roachtest/tests/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,8 @@ func registerRestore(r registry.Registry) {
clusterOpts = append(clusterOpts, spec.VolumeSize(largeVolumeSize))
testName += fmt.Sprintf("/pd-volume=%dGB", largeVolumeSize)
}
// Has been seen to OOM: https://github.com/cockroachdb/cockroach/issues/71805
clusterOpts = append(clusterOpts, spec.HighMem(true))

r.Add(registry.TestSpec{
Name: testName,
Expand Down
4 changes: 3 additions & 1 deletion pkg/cmd/roachtest/tests/tpcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,7 @@ func registerTPCC(r registry.Registry) {
registerTPCCBenchSpec(r, tpccBenchSpec{
Nodes: 9,
CPUs: 4,
HighMem: true, // can OOM otherwise: https://github.com/cockroachdb/cockroach/issues/73376
Distribution: multiRegion,
LoadConfig: multiLoadgen,

Expand Down Expand Up @@ -835,6 +836,7 @@ type tpccBenchSpec struct {
Owner registry.Owner // defaults to Test-Eng
Nodes int
CPUs int
HighMem bool
Chaos bool
AdmissionControlDisabled bool
Distribution tpccBenchDistribution
Expand Down Expand Up @@ -901,7 +903,7 @@ func registerTPCCBenchSpec(r registry.Registry, b tpccBenchSpec) {
nameParts = append(nameParts, "no-admission")
}

opts := []spec.Option{spec.CPU(b.CPUs)}
opts := []spec.Option{spec.CPU(b.CPUs), spec.HighMem(b.HighMem)}
switch b.Distribution {
case singleZone:
// No specifier.
Expand Down
26 changes: 26 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/apply_join
Original file line number Diff line number Diff line change
Expand Up @@ -578,3 +578,29 @@ WHERE
)
AS tab_397813 (col_664948, col_664949) ON (tab_397806.col_664940) = (tab_397813.col_664948)
);

subtest regression_87733

statement ok
CREATE TABLE t87733a (a INT);
CREATE TABLE t87733b (b INT);
INSERT INTO t87733a VALUES (1)

# Regression test for #87733. Do not panic when planning the RHS of an
# apply-join that refers to a With expression transitively through another With
# expression.
query T
WITH
t1 AS (SELECT a FROM t87733a),
t2 AS MATERIALIZED (SELECT a, b FROM t1 JOIN t87733b ON true)
SELECT NULL
FROM t1
LEFT JOIN LATERAL (
WITH t3 AS (SELECT * FROM t2 WHERE t2.a = t1.a)
SELECT array_agg(CASE WHEN v = '' THEN b END)
FROM (
SELECT '' AS v, b FROM t3 ORDER BY b DESC
)
) ON true;
----
NULL
1 change: 1 addition & 0 deletions pkg/sql/opt/exec/execbuilder/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ go_library(
"//pkg/util/encoding",
"//pkg/util/errorutil",
"//pkg/util/errorutil/unimplemented",
"//pkg/util/log",
"//pkg/util/timeutil",
"//pkg/util/treeprinter",
"@com_github_cockroachdb_errors//:errors",
Expand Down
54 changes: 45 additions & 9 deletions pkg/sql/opt/exec/execbuilder/relational.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ import (
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/encoding"
"github.com/cockroachdb/cockroach/pkg/util/errorutil"
"github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
Expand Down Expand Up @@ -1024,12 +1026,35 @@ func (b *Builder) buildApplyJoin(join memo.RelExpr) (execPlan, error) {
//
// Note: we put o outside of the function so we allocate it only once.
var o xform.Optimizer
planRightSideFn := func(ctx context.Context, ef exec.Factory, leftRow tree.Datums) (exec.Plan, error) {
planRightSideFn := func(ctx context.Context, ef exec.Factory, leftRow tree.Datums) (_ exec.Plan, err error) {
defer func() {
if r := recover(); r != nil {
// This code allows us to propagate internal errors without having to add
// error checks everywhere throughout the code. This is only possible
// because the code does not update shared state and does not manipulate
// locks.
//
// This is the same panic-catching logic that exists in
// o.Optimize() below. It's required here because it's possible
// for factory functions to panic below, like
// CopyAndReplaceDefault.
if ok, e := errorutil.ShouldCatch(r); ok {
err = e
log.VEventf(ctx, 1, "%v", err)
} else {
// Other panic objects can't be considered "safe" and thus are
// propagated as crashes that terminate the session.
panic(r)
}
}
}()

o.Init(ctx, b.evalCtx, b.catalog)
f := o.Factory()

// Copy the right expression into a new memo, replacing each bound column
// with the corresponding value from the left row.
addedWithBindings := false
var replaceFn norm.ReplaceFunc
replaceFn = func(e opt.Expr) opt.Expr {
switch t := e.(type) {
Expand All @@ -1039,15 +1064,26 @@ func (b *Builder) buildApplyJoin(join memo.RelExpr) (execPlan, error) {
}

case *memo.WithScanExpr:
// Allow referring to "outer" With expressions. The bound expressions
// are not part of this Memo but they are used only for their relational
// properties, which should be valid.
for i := range withExprs {
if withExprs[i].id == t.With {
memoExpr := b.mem.Metadata().WithBinding(t.With)
f.Metadata().AddWithBinding(t.With, memoExpr)
break
// Allow referring to "outer" With expressions. The bound
// expressions are not part of this Memo, but they are used only
// for their relational properties, which should be valid.
//
// We must add all With expressions to the metadata even if they
// aren't referred to directly because they might be referred to
// transitively through other With expressions. For example, if
// the RHS refers to With expression &1, and &1 refers to With
// expression &2, we must include &2 in the metadata so that
// its relational properties are available. See #87733.
//
// We lazily add these With expressions to the metadata here
// because the call to Factory.CopyAndReplace below clears With
// expressions in the metadata.
if !addedWithBindings {
for i, n := opt.WithID(1), b.mem.MaxWithID(); i <= n; i++ {
memoExpr := b.mem.Metadata().WithBinding(i)
f.Metadata().AddWithBinding(i, memoExpr)
}
addedWithBindings = true
}
// Fall through.
}
Expand Down
6 changes: 6 additions & 0 deletions pkg/sql/opt/memo/memo.go
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,12 @@ func (m *Memo) NextWithID() opt.WithID {
return m.curWithID
}

// MaxWithID returns the current maximum assigned identifier for a WITH
// expression.
func (m *Memo) MaxWithID() opt.WithID {
return m.curWithID
}

// Detach is used when we detach a memo that is to be reused later (either for
// execbuilding or with AssignPlaceholders). New expressions should no longer be
// constructed in this memo.
Expand Down

0 comments on commit 30ba7c3

Please sign in to comment.