From c3d4b166a1991b983cdd0938d2cd0fdb888fdf55 Mon Sep 17 00:00:00 2001
From: Rebecca Taft <becca@cockroachlabs.com>
Date: Fri, 14 Jan 2022 17:31:44 -0600
Subject: [PATCH] opt: fix bug in histogram calculation for TimeTZ

This commit fixes a bug in the histogram estimation code for TimeTZ
that made the faulty assumption that TimeTZ values are ordered by
TimeOfDay. This is incorrect since it does not take the OffsetSecs
into account. As a result, it was possible to estimate that the size
of a histogram bucket was negative, which caused problems in the
statistics estimation code. This commit fixes the problem by taking
into account both TimeOfDay and OffsetSecs when estimating the size of
a bucket in a TimeTZ histogram.

Fixes #74667

Release note (bug fix): Fixed an internal error, "estimated row count must
be non-zero", that could occur during planning for queries over a table
with a TimeTZ column. This error was due to a faulty assumption in the
statistics estimation code about ordering of TimeTZ values, which has now
been fixed. The error could occur when TimeTZ values used in the query had
a different time zone offset than the TimeTZ values stored in the table.
---
 pkg/sql/opt/memo/testdata/stats/groupby | 73 +++++++++++++++++++++++++
 pkg/sql/opt/props/BUILD.bazel           |  1 +
 pkg/sql/opt/props/histogram.go          | 17 +++++-
 pkg/sql/opt/props/histogram_test.go     | 70 +++++++++++++++++++++---
 4 files changed, 149 insertions(+), 12 deletions(-)

diff --git a/pkg/sql/opt/memo/testdata/stats/groupby b/pkg/sql/opt/memo/testdata/stats/groupby
index 08b6aae595e5..4ee1a6659b68 100644
--- a/pkg/sql/opt/memo/testdata/stats/groupby
+++ b/pkg/sql/opt/memo/testdata/stats/groupby
@@ -526,3 +526,76 @@ project
  │         └── bool_or:5 [type=bool, outer=(5), constraints=(/5: [/true - /true]; tight), fd=()-->(5)]
  └── projections
       └── 1 [as="?column?":6, type=int]
+
+# Regression test for #74667.
+exec-ddl
+CREATE TABLE t74667 (col TIMETZ PRIMARY KEY)
+----
+
+exec-ddl
+ALTER TABLE t74667 INJECT STATISTICS '[
+    {
+        "columns": [
+            "col"
+        ],
+        "created_at": "2000-01-01 00:00:00+00:00",
+        "distinct_count": 950814763580487611,
+        "histo_buckets": [
+            {
+                "distinct_range": 0,
+                "num_eq": 3873172219268179689,
+                "num_range": 0,
+                "upper_bound": "00:00:00+15:59:00"
+            },
+            {
+                "distinct_range": 400000,
+                "num_eq": 3000000000,
+                "num_range": 400000,
+                "upper_bound": "04:40:23.558699+11:08:00"
+            },
+            {
+                "distinct_range": 381143202295070850,
+                "num_eq": 6399369578112136136,
+                "num_range": 381143202295070816,
+                "upper_bound": "06:12:15.740051+06:40:00"
+            }
+        ],
+        "histo_col_type": "TIMETZ",
+        "name": "__auto__",
+        "null_count": 0,
+        "row_count": 1188522479222658429
+    }
+]'
+----
+
+norm
+SELECT count(*)
+FROM t74667
+WHERE col < '03:33:05.598931+07:11:00':::TIMETZ
+GROUP BY col;
+----
+project
+ ├── columns: count:4(int!null)
+ ├── stats: [rows=2.00509067e+16]
+ └── group-by (hash)
+      ├── columns: col:1(timetz!null) count_rows:4(int!null)
+      ├── grouping columns: col:1(timetz!null)
+      ├── stats: [rows=2.00509067e+16, distinct(1)=2.00509067e+16, null(1)=0]
+      ├── key: (1)
+      ├── fd: (1)-->(4)
+      ├── select
+      │    ├── columns: col:1(timetz!null)
+      │    ├── stats: [rows=4.52141047e+17, distinct(1)=2.00509067e+16, null(1)=0]
+      │    │   histogram(1)=  0      4.3209e+17       44624          3.3468e+08          2.0051e+16              0
+      │    │                <--- '00:00:00+15:59:00' ------- '04:40:23.558699+11:08:00' ------------ '03:33:06.598931+07:11:01'
+      │    ├── key: (1)
+      │    ├── scan t74667
+      │    │    ├── columns: col:1(timetz!null)
+      │    │    ├── stats: [rows=1.18852248e+18, distinct(1)=9.50814764e+17, null(1)=0]
+      │    │    │   histogram(1)=  0      4.3209e+17       44624          3.3468e+08          4.252e+16          7.1391e+17
+      │    │    │                <--- '00:00:00+15:59:00' ------- '04:40:23.558699+11:08:00' ----------- '06:12:15.740051+06:40:00'
+      │    │    └── key: (1)
+      │    └── filters
+      │         └── col:1 < '03:33:05.598931+07:11:00' [type=bool, outer=(1), constraints=(/1: (/NULL - /'03:33:06.598931+07:11:01']; tight)]
+      └── aggregations
+           └── count-rows [as=count_rows:4, type=int]
diff --git a/pkg/sql/opt/props/BUILD.bazel b/pkg/sql/opt/props/BUILD.bazel
index 15c27ec51249..58a90d689b28 100644
--- a/pkg/sql/opt/props/BUILD.bazel
+++ b/pkg/sql/opt/props/BUILD.bazel
@@ -25,6 +25,7 @@ go_library(
         "//pkg/sql/types",
         "//pkg/util/encoding",
         "//pkg/util/log",
+        "//pkg/util/timetz",
         "@com_github_cockroachdb_errors//:errors",
         "@com_github_olekukonko_tablewriter//:tablewriter",
     ],
diff --git a/pkg/sql/opt/props/histogram.go b/pkg/sql/opt/props/histogram.go
index 80a4704df2a8..3f3be3429198 100644
--- a/pkg/sql/opt/props/histogram.go
+++ b/pkg/sql/opt/props/histogram.go
@@ -17,6 +17,7 @@ import (
 	"io"
 	"math"
 	"sort"
+	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/sql/inverted"
 	"github.com/cockroachdb/cockroach/pkg/sql/opt"
@@ -26,6 +27,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
 	"github.com/cockroachdb/cockroach/pkg/sql/types"
 	"github.com/cockroachdb/cockroach/pkg/util/encoding"
+	"github.com/cockroachdb/cockroach/pkg/util/timetz"
 	"github.com/cockroachdb/errors"
 	"github.com/olekukonko/tablewriter"
 )
@@ -785,9 +787,18 @@ func getRangesBeforeAndAfter(
 			return rng, true
 
 		case types.TimeTZFamily:
-			lower := lowerBound.(*tree.DTimeTZ).TimeOfDay
-			upper := upperBound.(*tree.DTimeTZ).TimeOfDay
-			rng = float64(upper) - float64(lower)
+			// timeTZOffsetSecsRange is the total number of possible values for offset.
+			timeTZOffsetSecsRange := timetz.MaxTimeTZOffsetSecs - timetz.MinTimeTZOffsetSecs + 1
+
+			// Find the range in microseconds based on the absolute times of the range
+			// boundaries.
+			lower := lowerBound.(*tree.DTimeTZ)
+			upper := upperBound.(*tree.DTimeTZ)
+			rng = float64(upper.ToTime().Sub(lower.ToTime()) / time.Microsecond)
+
+			// Account for the offset.
+			rng *= float64(timeTZOffsetSecsRange)
+			rng += float64(upper.OffsetSecs - lower.OffsetSecs)
 			return rng, true
 
 		default:
diff --git a/pkg/sql/opt/props/histogram_test.go b/pkg/sql/opt/props/histogram_test.go
index eca72b201a3b..41fb081741b3 100644
--- a/pkg/sql/opt/props/histogram_test.go
+++ b/pkg/sql/opt/props/histogram_test.go
@@ -384,7 +384,7 @@ func TestFilterBucket(t *testing.T) {
 						t.Fatalf("for span %s expected an error", testCase.span)
 					}
 					if !reflect.DeepEqual(testCase.expected, actual) {
-						t.Fatalf("for span %s exected %v but found %v", testCase.span, testCase.expected, actual)
+						t.Errorf("for span %s exected %v but found %v", testCase.span, testCase.expected, actual)
 					}
 				}
 			}
@@ -612,17 +612,17 @@ func TestFilterBucket(t *testing.T) {
 	})
 
 	t.Run("timetz", func(t *testing.T) {
-		upperBound, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00", time.Microsecond)
+		upperBound1, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00", time.Microsecond)
 		if err != nil {
 			t.Fatal(err)
 		}
-		lowerBound, _, err := tree.ParseDTimeTZ(&evalCtx, "04:00:00", time.Microsecond)
+		lowerBound1, _, err := tree.ParseDTimeTZ(&evalCtx, "04:00:00", time.Microsecond)
 		if err != nil {
 			t.Fatal(err)
 		}
-		h := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{
-			{NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: getPrevUpperBound(lowerBound)},
-			{NumEq: 1, NumRange: 62, DistinctRange: 31, UpperBound: upperBound},
+		h1 := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{
+			{NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: getPrevUpperBound(lowerBound1)},
+			{NumEq: 1, NumRange: 62, DistinctRange: 31, UpperBound: upperBound1},
 		}}
 
 		ub1, _, err := tree.ParseDTimeTZ(&evalCtx, "04:15:00", time.Microsecond)
@@ -630,18 +630,70 @@ func TestFilterBucket(t *testing.T) {
 			t.Fatal(err)
 		}
 
-		testData := []testCase{
+		testData1 := []testCase{
 			{
 				span:     "[/04:00:00 - /04:15:00]",
 				expected: &cat.HistogramBucket{NumEq: 0, NumRange: 15.5, DistinctRange: 7.75, UpperBound: ub1},
 			},
 			{
 				span:     "[/04:30:00 - /05:00:00]",
-				expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound},
+				expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1},
+			},
+			{
+				span:     "[/06:30:00+02:00:00 - /05:00:00]",
+				expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1},
+			},
+			{
+				span:     "[/08:30:00+04:00:00 - /05:00:00]",
+				expected: &cat.HistogramBucket{NumEq: 1, NumRange: 31, DistinctRange: 15.5, UpperBound: upperBound1},
+			},
+		}
+
+		// This test distinguishes between values that have the same time but
+		// different offsets.
+		upperBound2, _, err := tree.ParseDTimeTZ(&evalCtx, "05:00:00.000001", time.Microsecond)
+		if err != nil {
+			t.Fatal(err)
+		}
+		h2 := &Histogram{evalCtx: &evalCtx, col: col, buckets: []cat.HistogramBucket{
+			{NumEq: 0, NumRange: 0, DistinctRange: 0, UpperBound: upperBound1},
+			{NumEq: 1, NumRange: 10000, DistinctRange: 1000, UpperBound: upperBound2},
+		}}
+
+		ub2, _, err := tree.ParseDTimeTZ(&evalCtx, "20:59:00.000001+15:59:00", time.Microsecond)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		testData2 := []testCase{
+			{
+				span:     "[/05:00:00.000001 - /05:00:00.000001]",
+				expected: &cat.HistogramBucket{NumEq: 1, NumRange: 0, DistinctRange: 0, UpperBound: upperBound2},
+			},
+			{
+				span:     "[/07:00:00.000001+02:00:00 - /05:00:00.000001]",
+				expected: &cat.HistogramBucket{NumEq: 1, NumRange: 625.65, DistinctRange: 62.57, UpperBound: upperBound2},
+			},
+			{
+				span:     "[/09:00:00.000001+04:00:00 - /05:00:00.000001]",
+				expected: &cat.HistogramBucket{NumEq: 1, NumRange: 1251.3, DistinctRange: 125.13, UpperBound: upperBound2},
+			},
+			{
+				span:     "[/04:59:59-00:00:01 - /20:59:00.000001+15:59:00]",
+				expected: &cat.HistogramBucket{NumEq: 0, NumRange: 5000, DistinctRange: 500, UpperBound: ub2},
+			},
+			{
+				span:     "[/20:59:00.000001+15:59:00 - /05:00:00.000001]",
+				expected: &cat.HistogramBucket{NumEq: 1, NumRange: 5000, DistinctRange: 500, UpperBound: upperBound2},
+			},
+			{
+				span:     "[/04:59:58-00:00:02 - /05:00:00.000001]",
+				expected: &cat.HistogramBucket{NumEq: 1, NumRange: 9999.91, DistinctRange: 999.99, UpperBound: upperBound2},
 			},
 		}
 
-		runTest(h, testData, types.TimeTZFamily)
+		runTest(h1, testData1, types.TimeTZFamily)
+		runTest(h2, testData2, types.TimeTZFamily)
 	})
 
 	t.Run("string", func(t *testing.T) {