From 50fc5666ed362a12bc3d132590e55d2973bb78ed Mon Sep 17 00:00:00 2001 From: Yahor Yuzefovich Date: Thu, 21 May 2020 12:51:52 -0700 Subject: [PATCH] roachtest: run EXPLAIN ANALYZE when tpchvec/perf fails We have a mysterious rare failure on query 7 in which `vectorize=on` performs significantly worse than `vectorize=off`. I'm out of possible explanations for this, so this commit adds an ability to run `EXPLAIN ANALYZE` on the query with both `vectorize` options when the slowness threshold is exceeded. Hopefully it'll give us some insight into the perf failures. Release note: None --- pkg/cmd/roachtest/tpchvec.go | 58 +++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/pkg/cmd/roachtest/tpchvec.go b/pkg/cmd/roachtest/tpchvec.go index d7728a306e1b..637b49d0d085 100644 --- a/pkg/cmd/roachtest/tpchvec.go +++ b/pkg/cmd/roachtest/tpchvec.go @@ -25,12 +25,12 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/binfetcher" "github.com/cockroachdb/cockroach/pkg/util/randutil" + "github.com/cockroachdb/cockroach/pkg/workload/tpch" "github.com/cockroachdb/errors" ) const ( - tpchVecNodeCount = 3 - tpchVecNumQueries = 22 + tpchVecNodeCount = 3 ) type crdbVersion int @@ -53,13 +53,19 @@ func toCRDBVersion(v string) (crdbVersion, error) { } } -var ( - vectorizeOnOptionByVersion = map[crdbVersion]string{ - tpchVecVersion19_2: "experimental_on", - tpchVecVersion20_1: "on", - tpchVecVersion20_2: "on", +func vectorizeOptionToSetting(vectorize bool, version crdbVersion) string { + if !vectorize { + return "off" + } + switch version { + case tpchVecVersion19_2: + return "experimental_on" + default: + return "on" } +} +var ( // queriesToSkipByVersion is a map keyed by version that contains query numbers // to be skipped for the given version (as well as the reasons for why they are skipped). queriesToSkipByVersion = map[crdbVersion]map[int]string{ @@ -209,10 +215,10 @@ func (p *tpchVecPerfTest) postQueryRunHook(t *test, output []byte, vectorized bo } } -func (p *tpchVecPerfTest) postTestRunHook(t *test, _ *gosql.DB, version crdbVersion) { +func (p *tpchVecPerfTest) postTestRunHook(t *test, conn *gosql.DB, version crdbVersion) { queriesToSkip := queriesToSkipByVersion[version] t.Status("comparing the runtimes (only median values for each query are compared)") - for queryNum := 1; queryNum <= tpchVecNumQueries; queryNum++ { + for queryNum := 1; queryNum <= tpch.NumQueries; queryNum++ { if _, skipped := queriesToSkip[queryNum]; skipped { continue } @@ -248,6 +254,33 @@ func (p *tpchVecPerfTest) postTestRunHook(t *test, _ *gosql.DB, version crdbVers vecOnTime, vecOffTime, vecOnTimes, vecOffTimes)) } if vecOnTime >= slownessThresholdByVersion[version]*vecOffTime { + // For some reason, the vectorized engine executed the query a lot + // slower than the row-by-row engine which is unexpected. In order + // to understand where the slowness comes from, we will run EXPLAIN + // ANALYZE of the query with all `vectorize` options + // tpchPerfTestNumRunsPerQuery times (hoping at least one will + // "catch" the slowness). + for _, vectorize := range p.vectorizeOptions() { + vectorizeSetting := vectorizeOptionToSetting(vectorize, version) + if _, err := conn.Exec(fmt.Sprintf("SET vectorize=%s;", vectorizeSetting)); err != nil { + t.Fatal(err) + } + for i := 0; i < tpchPerfTestNumRunsPerQuery; i++ { + rows, err := conn.Query(fmt.Sprintf( + "SELECT url FROM [EXPLAIN ANALYZE %s];", tpch.QueriesByNumber[queryNum], + )) + if err != nil { + t.Fatal(err) + } + defer rows.Close() + var url string + rows.Next() + if err = rows.Scan(&url); err != nil { + t.Fatal(err) + } + t.Status(fmt.Sprintf("EXPLAIN ANALYZE with vectorize=%s url:\n%s", vectorizeSetting, url)) + } + } t.Fatal(fmt.Sprintf( "[q%d] vec ON is slower by %.2f%% than vec OFF\n"+ "vec ON times: %v\nvec OFF times: %v", @@ -303,16 +336,13 @@ func baseTestRun( ) { firstNode := c.Node(1) queriesToSkip := queriesToSkipByVersion[version] - for queryNum := 1; queryNum <= tpchVecNumQueries; queryNum++ { + for queryNum := 1; queryNum <= tpch.NumQueries; queryNum++ { for _, vectorize := range tc.vectorizeOptions() { if reason, skip := queriesToSkip[queryNum]; skip { t.Status(fmt.Sprintf("skipping q%d because of %q", queryNum, reason)) continue } - vectorizeSetting := "off" - if vectorize { - vectorizeSetting = vectorizeOnOptionByVersion[version] - } + vectorizeSetting := vectorizeOptionToSetting(vectorize, version) cmd := fmt.Sprintf("./workload run tpch --concurrency=1 --db=tpch "+ "--max-ops=%d --queries=%d --vectorize=%s {pgurl:1-%d}", tc.numRunsPerQuery(), queryNum, vectorizeSetting, tpchVecNodeCount)