From c3d4b166a1991b983cdd0938d2cd0fdb888fdf55 Mon Sep 17 00:00:00 2001 From: Rebecca Taft Date: Fri, 14 Jan 2022 17:31:44 -0600 Subject: [PATCH] opt: fix bug in histogram calculation for TimeTZ This commit fixes a bug in the histogram estimation code for TimeTZ that made the faulty assumption that TimeTZ values are ordered by TimeOfDay. This is incorrect since it does not take the OffsetSecs into account. As a result, it was possible to estimate that the size of a histogram bucket was negative, which caused problems in the statistics estimation code. This commit fixes the problem by taking into account both TimeOfDay and OffsetSecs when estimating the size of a bucket in a TimeTZ histogram. Fixes #74667 Release note (bug fix): Fixed an internal error, "estimated row count must be non-zero", that could occur during planning for queries over a table with a TimeTZ column. This error was due to a faulty assumption in the statistics estimation code about ordering of TimeTZ values, which has now been fixed. The error could occur when TimeTZ values used in the query had a different time zone offset than the TimeTZ values stored in the table. --- pkg/sql/opt/memo/testdata/stats/groupby | 73 +++++++++++++++++++++++++ pkg/sql/opt/props/BUILD.bazel | 1 + pkg/sql/opt/props/histogram.go | 17 +++++- pkg/sql/opt/props/histogram_test.go | 70 +++++++++++++++++++++--- 4 files changed, 149 insertions(+), 12 deletions(-) diff --git a/pkg/sql/opt/memo/testdata/stats/groupby b/pkg/sql/opt/memo/testdata/stats/groupby index 08b6aae595e5..4ee1a6659b68 100644 --- a/pkg/sql/opt/memo/testdata/stats/groupby +++ b/pkg/sql/opt/memo/testdata/stats/groupby @@ -526,3 +526,76 @@ project │ └── bool_or:5 [type=bool, outer=(5), constraints=(/5: [/true - /true]; tight), fd=()-->(5)] └── projections └── 1 [as="?column?":6, type=int] + +# Regression test for #74667. +exec-ddl +CREATE TABLE t74667 (col TIMETZ PRIMARY KEY) +---- + +exec-ddl +ALTER TABLE t74667 INJECT STATISTICS '[ + { + "columns": [ + "col" + ], + "created_at": "2000-01-01 00:00:00+00:00", + "distinct_count": 950814763580487611, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 3873172219268179689, + "num_range": 0, + "upper_bound": "00:00:00+15:59:00" + }, + { + "distinct_range": 400000, + "num_eq": 3000000000, + "num_range": 400000, + "upper_bound": "04:40:23.558699+11:08:00" + }, + { + "distinct_range": 381143202295070850, + "num_eq": 6399369578112136136, + "num_range": 381143202295070816, + "upper_bound": "06:12:15.740051+06:40:00" + } + ], + "histo_col_type": "TIMETZ", + "name": "__auto__", + "null_count": 0, + "row_count": 1188522479222658429 + } +]' +---- + +norm +SELECT count(*) +FROM t74667 +WHERE col < '03:33:05.598931+07:11:00':::TIMETZ +GROUP BY col; +---- +project + ├── columns: count:4(int!null) + ├── stats: [rows=2.00509067e+16] + └── group-by (hash) + ├── columns: col:1(timetz!null) count_rows:4(int!null) + ├── grouping columns: col:1(timetz!null) + ├── stats: [rows=2.00509067e+16, distinct(1)=2.00509067e+16, null(1)=0] + ├── key: (1) + ├── fd: (1)-->(4) + ├── select + │ ├── columns: col:1(timetz!null) + │ ├── stats: [rows=4.52141047e+17, distinct(1)=2.00509067e+16, null(1)=0] + │ │ histogram(1)= 0 4.3209e+17 44624 3.3468e+08 2.0051e+16 0 + │ │ <--- '00:00:00+15:59:00' ------- '04:40:23.558699+11:08:00' ------------ '03:33:06.598931+07:11:01' + │ ├── key: (1) + │ ├── scan t74667 + │ │ ├── columns: col:1(timetz!null) + │ │ ├── stats: [rows=1.18852248e+18, distinct(1)=9.50814764e+17, null(1)=0] + │ │ │ histogram(1)= 0 4.3209e+17 44624 3.3468e+08 4.252e+16 7.1391e+17 + │ │ │ <--- '00:00:00+15:59:00' ------- '04:40:23.558699+11:08:00' ----------- '06:12:15.740051+06:40:00' + │ │ └── key: (1) + │ └── filters + │ └── col:1 < '03:33:05.598931+07:11:00' [type=bool, outer=(1), constraints=(/1: (/NULL - /'03:33:06.598931+07:11:01']; tight)] + └── aggregations + └── count-rows [as=count_rows:4, type=int] diff --git a/pkg/sql/opt/props/BUILD.bazel b/pkg/sql/opt/props/BUILD.bazel index 15c27ec51249..58a90d689b28 100644 --- a/pkg/sql/opt/props/BUILD.bazel +++ b/pkg/sql/opt/props/BUILD.bazel @@ -25,6 +25,7 @@ go_library( "//pkg/sql/types", "//pkg/util/encoding", "//pkg/util/log", + "//pkg/util/timetz", "@com_github_cockroachdb_errors//:errors", "@com_github_olekukonko_tablewriter//:tablewriter", ], diff --git a/pkg/sql/opt/props/histogram.go b/pkg/sql/opt/props/histogram.go index 80a4704df2a8..3f3be3429198 100644 --- a/pkg/sql/opt/props/histogram.go +++ b/pkg/sql/opt/props/histogram.go @@ -17,6 +17,7 @@ import ( "io" "math" "sort" + "time" "github.com/cockroachdb/cockroach/pkg/sql/inverted" "github.com/cockroachdb/cockroach/pkg/sql/opt" @@ -26,6 +27,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/types" "github.com/cockroachdb/cockroach/pkg/util/encoding" + "github.com/cockroachdb/cockroach/pkg/util/timetz" "github.com/cockroachdb/errors" "github.com/olekukonko/tablewriter" ) @@ -785,9 +787,18 @@ func getRangesBeforeAndAfter( return rng, true case types.TimeTZFamily: - lower := lowerBound.(*tree.DTimeTZ).TimeOfDay - upper := upperBound.(*tree.DTimeTZ).TimeOfDay - rng = float64(upper) - float64(lower) + // timeTZOffsetSecsRange is the total number of possible values for offset. + timeTZOffsetSecsRange := timetz.MaxTimeTZOffsetSecs - timetz.MinTimeTZOffsetSecs + 1 + + // Find the range in microseconds based on the absolute times of the range + // boundaries. + lower := lowerBound.(*tree.DTimeTZ) + upper := upperBound.(*tree.DTimeTZ) + rng = float64(upper.ToTime().Sub(lower.ToTime()) / time.Microsecond) + + // Account for the offset. + rng *= float64(timeTZOffsetSecsRange) + rng += float64(upper.OffsetSecs - lower.OffsetSecs) return rng, true default: diff --git a/pkg/sql/opt/props/histogram_test.go b/pkg/sql/opt/props/histogram_test.go index eca72b201a3b..41fb081741b3 100644 --- a/pkg/sql/opt/props/histogram_test.go +++ b/pkg/sql/opt/props/histogram_test.go @@ -384,7 +384,7 @@ func TestFilterBucket(t *testing.T) { t.Fatalf("for span %s expected an error", testCase.span) } if !reflect.DeepEqual(testCase.expected, actual) { - t.Fatalf("for span %s exected %v but found %v", testCase.span, testCase.expected, actual) + t.Errorf("for span %s exected %v but found %v", testCase.span, testCase.expected, actual) } } } @@ -612,17 +612,17 @@ func TestFilterBucket(t *testing.T) { }) t.Run("timetz", func(t *testing.T) { - upperBound, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00", time.Microsecond) + upperBound1, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00", time.Microsecond) if err != nil { t.Fatal(err) } - lowerBound, _, err := tree.ParseDTimeTZ(&evalCtx, "04:00:00", time.Microsecond) + lowerBound1, _, err := tree.ParseDTimeTZ(&evalCtx, "04:00:00", time.Microsecond) if err != nil { t.Fatal(err) } - h := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{ - {NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: getPrevUpperBound(lowerBound)}, - {NumEq: 1, NumRange: 62, DistinctRange: 31, UpperBound: upperBound}, + h1 := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{ + {NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: getPrevUpperBound(lowerBound1)}, + {NumEq: 1, NumRange: 62, DistinctRange: 31, UpperBound: upperBound1}, }} ub1, _, err := tree.ParseDTimeTZ(&evalCtx, "04:15:00", time.Microsecond) @@ -630,18 +630,70 @@ func TestFilterBucket(t *testing.T) { t.Fatal(err) } - testData := []testCase{ + testData1 := []testCase{ { span: "[/04:00:00 - /04:15:00]", expected: &cat.HistogramBucket{NumEq: 0, NumRange: 15.5, DistinctRange: 7.75, UpperBound: ub1}, }, { span: "[/04:30:00 - /05:00:00]", - expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound}, + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1}, + }, + { + span: "[/06:30:00+02:00:00 - /05:00:00]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1}, + }, + { + span: "[/08:30:00+04:00:00 - /05:00:00]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1}, + }, + } + + // This test distinguishes between values that have the same time but + // different offsets. + upperBound2, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00.000001", time.Microsecond) + if err != nil { + t.Fatal(err) + } + h2 := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{ + {NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: upperBound1}, + {NumEq: 1, NumRange: 10000, DistinctRange: 1000, UpperBound: upperBound2}, + }} + + ub2, _, err := tree.ParseDTimeTZ(&evalCtx, "20:59:00.000001+15:59:00", time.Microsecond) + if err != nil { + t.Fatal(err) + } + + testData2 := []testCase{ + { + span: "[/05:00:00.000001 - /05:00:00.000001]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 0, DistinctRange: 0, UpperBound: upperBound2}, + }, + { + span: "[/07:00:00.000001+02:00:00 - /05:00:00.000001]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 625.65, DistinctRange: 62.57, UpperBound: upperBound2}, + }, + { + span: "[/09:00:00.000001+04:00:00 - /05:00:00.000001]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 1251.3, DistinctRange: 125.13, UpperBound: upperBound2}, + }, + { + span: "[/04:59:59-00:00:01 - /20:59:00.000001+15:59:00]", + expected: &cat.HistogramBucket{NumEq: 0, NumRange: 5000, DistinctRange: 500, UpperBound: ub2}, + }, + { + span: "[/20:59:00.000001+15:59:00 - /05:00:00.000001]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 5000, DistinctRange: 500, UpperBound: upperBound2}, + }, + { + span: "[/04:59:58-00:00:02 - /05:00:00.000001]", + expected: &cat.HistogramBucket{NumEq: 1, NumRange: 9999.91, DistinctRange: 999.99, UpperBound: upperBound2}, }, } - runTest(h, testData, types.TimeTZFamily) + runTest(h1, testData1, types.TimeTZFamily) + runTest(h2, testData2, types.TimeTZFamily) }) t.Run("string", func(t *testing.T) {