From 88d2aba9abb4cae380a317e3861ecb097f1d3fca Mon Sep 17 00:00:00 2001 From: Marcus Gartner Date: Tue, 31 Aug 2021 13:02:42 -0400 Subject: [PATCH] colexec: fix IN operator with unsorted tuple The vectorized implementation of an `element IN tuple` expression assumes that the contents of `tuple` are sorted by the optimizer. Based on this assumption, it performs a binary search instead of a linear search. However, the assumption that the optimizer sorts all tuples is incorrect. For example, there are cases where the contents of a tuple are not known at planning-time, so the tuple cannot be sorted. Performing a binary search with an unsorted tuple causes incorrect query results. Now, the vectorized engine sorts tuple contents if they are not already sorted. Fixes #68979 Release justification: This commit fixes a bug with the IN operator that causes incorrect results. Release note (bug fix): A bug has been fixed which caused incorrect evaluation of the `IN` operator when the tuple on the right-hand-side of the operator included a subquery, like `a IN ('foo', (SELECT s FROM t), 'bar')`. --- .../execgen/cmd/execgen/select_in_gen.go | 4 +- pkg/sql/colexec/select_in.eg.go | 439 +++++++++++++++++- pkg/sql/colexec/select_in_test.go | 14 + pkg/sql/colexec/select_in_tmpl.go | 35 +- .../logictest/testdata/logic_test/vectorize | 15 + 5 files changed, 481 insertions(+), 26 deletions(-) diff --git a/pkg/sql/colexec/execgen/cmd/execgen/select_in_gen.go b/pkg/sql/colexec/execgen/cmd/execgen/select_in_gen.go index b0e6e7587d11..50bae7a4f02f 100644 --- a/pkg/sql/colexec/execgen/cmd/execgen/select_in_gen.go +++ b/pkg/sql/colexec/execgen/cmd/execgen/select_in_gen.go @@ -31,8 +31,8 @@ func genSelectIn(inputFileContents string, wr io.Writer) error { ) s := r.Replace(inputFileContents) - assignEq := makeFunctionRegex("_COMPARE", 5) - s = assignEq.ReplaceAllString(s, makeTemplateFunctionCall("Compare", 5)) + compare := makeFunctionRegex("_COMPARE", 5) + s = compare.ReplaceAllString(s, makeTemplateFunctionCall("Compare", 5)) s = replaceManipulationFuncs(s) diff --git a/pkg/sql/colexec/select_in.eg.go b/pkg/sql/colexec/select_in.eg.go index cb6d6fd5040e..eac5850cd665 100644 --- a/pkg/sql/colexec/select_in.eg.go +++ b/pkg/sql/colexec/select_in.eg.go @@ -12,6 +12,7 @@ package colexec import ( "bytes" "math" + "sort" "time" "github.com/cockroachdb/apd/v2" @@ -352,6 +353,7 @@ type selectInOpBool struct { filterRow []bool hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOpBool{} @@ -364,6 +366,7 @@ type projectInOpBool struct { filterRow []bool hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOpBool{} @@ -384,11 +387,30 @@ func fillDatumRowBool(t *types.T, datumTuple *tree.DTuple) ([]bool, bool) { return result, hasNulls } +func sortDatumRowBool(filterRow []bool, targetCol coldata.Bools) { + less := func(i, j int) bool { + var cmpResult int + + if !filterRow[i] && filterRow[j] { + cmpResult = -1 + } else if filterRow[i] && !filterRow[j] { + cmpResult = 1 + } else { + cmpResult = 0 + } + + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpInBool( targetElem bool, targetCol coldata.Bools, filterRow []bool, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRowBool, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -431,6 +453,14 @@ func (si *selectInOpBool) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRowBool because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRowBool(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -501,6 +531,14 @@ func (pi *projectInOpBool) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.Bool() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRowBool because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRowBool(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() @@ -586,6 +624,7 @@ type selectInOpBytes struct { filterRow [][]byte hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOpBytes{} @@ -598,6 +637,7 @@ type projectInOpBytes struct { filterRow [][]byte hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOpBytes{} @@ -618,11 +658,22 @@ func fillDatumRowBytes(t *types.T, datumTuple *tree.DTuple) ([][]byte, bool) { return result, hasNulls } +func sortDatumRowBytes(filterRow [][]byte, targetCol *coldata.Bytes) { + less := func(i, j int) bool { + var cmpResult int + cmpResult = bytes.Compare(filterRow[i], filterRow[j]) + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpInBytes( targetElem []byte, targetCol *coldata.Bytes, filterRow [][]byte, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRowBytes, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -657,6 +708,14 @@ func (si *selectInOpBytes) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRowBytes because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRowBytes(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -725,6 +784,14 @@ func (pi *projectInOpBytes) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.Bytes() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRowBytes because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRowBytes(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() @@ -808,6 +875,7 @@ type selectInOpDecimal struct { filterRow []apd.Decimal hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOpDecimal{} @@ -820,6 +888,7 @@ type projectInOpDecimal struct { filterRow []apd.Decimal hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOpDecimal{} @@ -840,11 +909,22 @@ func fillDatumRowDecimal(t *types.T, datumTuple *tree.DTuple) ([]apd.Decimal, bo return result, hasNulls } +func sortDatumRowDecimal(filterRow []apd.Decimal, targetCol coldata.Decimals) { + less := func(i, j int) bool { + var cmpResult int + cmpResult = tree.CompareDecimals(&filterRow[i], &filterRow[j]) + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpInDecimal( targetElem apd.Decimal, targetCol coldata.Decimals, filterRow []apd.Decimal, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRowDecimal, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -879,6 +959,14 @@ func (si *selectInOpDecimal) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRowDecimal because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRowDecimal(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -949,6 +1037,14 @@ func (pi *projectInOpDecimal) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.Decimal() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRowDecimal because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRowDecimal(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() @@ -1034,6 +1130,7 @@ type selectInOpInt16 struct { filterRow []int16 hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOpInt16{} @@ -1046,6 +1143,7 @@ type projectInOpInt16 struct { filterRow []int16 hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOpInt16{} @@ -1066,11 +1164,33 @@ func fillDatumRowInt16(t *types.T, datumTuple *tree.DTuple) ([]int16, bool) { return result, hasNulls } +func sortDatumRowInt16(filterRow []int16, targetCol coldata.Int16s) { + less := func(i, j int) bool { + var cmpResult int + + { + a, b := int64(filterRow[i]), int64(filterRow[j]) + if a < b { + cmpResult = -1 + } else if a > b { + cmpResult = 1 + } else { + cmpResult = 0 + } + } + + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpInInt16( targetElem int16, targetCol coldata.Int16s, filterRow []int16, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRowInt16, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -1116,6 +1236,14 @@ func (si *selectInOpInt16) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRowInt16 because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRowInt16(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -1186,6 +1314,14 @@ func (pi *projectInOpInt16) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.Int16() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRowInt16 because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRowInt16(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() @@ -1271,6 +1407,7 @@ type selectInOpInt32 struct { filterRow []int32 hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOpInt32{} @@ -1283,6 +1420,7 @@ type projectInOpInt32 struct { filterRow []int32 hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOpInt32{} @@ -1303,11 +1441,33 @@ func fillDatumRowInt32(t *types.T, datumTuple *tree.DTuple) ([]int32, bool) { return result, hasNulls } +func sortDatumRowInt32(filterRow []int32, targetCol coldata.Int32s) { + less := func(i, j int) bool { + var cmpResult int + + { + a, b := int64(filterRow[i]), int64(filterRow[j]) + if a < b { + cmpResult = -1 + } else if a > b { + cmpResult = 1 + } else { + cmpResult = 0 + } + } + + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpInInt32( targetElem int32, targetCol coldata.Int32s, filterRow []int32, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRowInt32, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -1353,6 +1513,14 @@ func (si *selectInOpInt32) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRowInt32 because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRowInt32(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -1423,6 +1591,14 @@ func (pi *projectInOpInt32) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.Int32() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRowInt32 because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRowInt32(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() @@ -1508,6 +1684,7 @@ type selectInOpInt64 struct { filterRow []int64 hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOpInt64{} @@ -1520,6 +1697,7 @@ type projectInOpInt64 struct { filterRow []int64 hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOpInt64{} @@ -1540,11 +1718,33 @@ func fillDatumRowInt64(t *types.T, datumTuple *tree.DTuple) ([]int64, bool) { return result, hasNulls } +func sortDatumRowInt64(filterRow []int64, targetCol coldata.Int64s) { + less := func(i, j int) bool { + var cmpResult int + + { + a, b := int64(filterRow[i]), int64(filterRow[j]) + if a < b { + cmpResult = -1 + } else if a > b { + cmpResult = 1 + } else { + cmpResult = 0 + } + } + + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpInInt64( targetElem int64, targetCol coldata.Int64s, filterRow []int64, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRowInt64, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -1590,6 +1790,14 @@ func (si *selectInOpInt64) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRowInt64 because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRowInt64(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -1660,6 +1868,14 @@ func (pi *projectInOpInt64) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.Int64() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRowInt64 because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRowInt64(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() @@ -1745,6 +1961,7 @@ type selectInOpFloat64 struct { filterRow []float64 hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOpFloat64{} @@ -1757,6 +1974,7 @@ type projectInOpFloat64 struct { filterRow []float64 hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOpFloat64{} @@ -1777,11 +1995,41 @@ func fillDatumRowFloat64(t *types.T, datumTuple *tree.DTuple) ([]float64, bool) return result, hasNulls } +func sortDatumRowFloat64(filterRow []float64, targetCol coldata.Float64s) { + less := func(i, j int) bool { + var cmpResult int + + { + a, b := float64(filterRow[i]), float64(filterRow[j]) + if a < b { + cmpResult = -1 + } else if a > b { + cmpResult = 1 + } else if a == b { + cmpResult = 0 + } else if math.IsNaN(a) { + if math.IsNaN(b) { + cmpResult = 0 + } else { + cmpResult = -1 + } + } else { + cmpResult = 1 + } + } + + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpInFloat64( targetElem float64, targetCol coldata.Float64s, filterRow []float64, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRowFloat64, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -1835,6 +2083,14 @@ func (si *selectInOpFloat64) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRowFloat64 because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRowFloat64(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -1905,6 +2161,14 @@ func (pi *projectInOpFloat64) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.Float64() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRowFloat64 because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRowFloat64(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() @@ -1990,6 +2254,7 @@ type selectInOpTimestamp struct { filterRow []time.Time hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOpTimestamp{} @@ -2002,6 +2267,7 @@ type projectInOpTimestamp struct { filterRow []time.Time hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOpTimestamp{} @@ -2022,11 +2288,29 @@ func fillDatumRowTimestamp(t *types.T, datumTuple *tree.DTuple) ([]time.Time, bo return result, hasNulls } +func sortDatumRowTimestamp(filterRow []time.Time, targetCol coldata.Times) { + less := func(i, j int) bool { + var cmpResult int + + if filterRow[i].Before(filterRow[j]) { + cmpResult = -1 + } else if filterRow[j].Before(filterRow[i]) { + cmpResult = 1 + } else { + cmpResult = 0 + } + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpInTimestamp( targetElem time.Time, targetCol coldata.Times, filterRow []time.Time, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRowTimestamp, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -2068,6 +2352,14 @@ func (si *selectInOpTimestamp) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRowTimestamp because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRowTimestamp(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -2138,6 +2430,14 @@ func (pi *projectInOpTimestamp) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.Timestamp() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRowTimestamp because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRowTimestamp(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() @@ -2223,6 +2523,7 @@ type selectInOpInterval struct { filterRow []duration.Duration hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOpInterval{} @@ -2235,6 +2536,7 @@ type projectInOpInterval struct { filterRow []duration.Duration hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOpInterval{} @@ -2255,11 +2557,22 @@ func fillDatumRowInterval(t *types.T, datumTuple *tree.DTuple) ([]duration.Durat return result, hasNulls } +func sortDatumRowInterval(filterRow []duration.Duration, targetCol coldata.Durations) { + less := func(i, j int) bool { + var cmpResult int + cmpResult = filterRow[i].Compare(filterRow[j]) + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpInInterval( targetElem duration.Duration, targetCol coldata.Durations, filterRow []duration.Duration, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRowInterval, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -2294,6 +2607,14 @@ func (si *selectInOpInterval) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRowInterval because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRowInterval(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -2364,6 +2685,14 @@ func (pi *projectInOpInterval) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.Interval() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRowInterval because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRowInterval(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() @@ -2449,6 +2778,7 @@ type selectInOpJSON struct { filterRow []json.JSON hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOpJSON{} @@ -2461,6 +2791,7 @@ type projectInOpJSON struct { filterRow []json.JSON hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOpJSON{} @@ -2481,11 +2812,28 @@ func fillDatumRowJSON(t *types.T, datumTuple *tree.DTuple) ([]json.JSON, bool) { return result, hasNulls } +func sortDatumRowJSON(filterRow []json.JSON, targetCol *coldata.JSONs) { + less := func(i, j int) bool { + var cmpResult int + + var err error + cmpResult, err = filterRow[i].Compare(filterRow[j]) + if err != nil { + colexecerror.ExpectedError(err) + } + + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpInJSON( targetElem json.JSON, targetCol *coldata.JSONs, filterRow []json.JSON, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRowJSON, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -2526,6 +2874,14 @@ func (si *selectInOpJSON) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRowJSON because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRowJSON(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -2594,6 +2950,14 @@ func (pi *projectInOpJSON) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.JSON() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRowJSON because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRowJSON(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() @@ -2677,6 +3041,7 @@ type selectInOpDatum struct { filterRow []interface{} hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOpDatum{} @@ -2689,6 +3054,7 @@ type projectInOpDatum struct { filterRow []interface{} hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOpDatum{} @@ -2709,11 +3075,24 @@ func fillDatumRowDatum(t *types.T, datumTuple *tree.DTuple) ([]interface{}, bool return result, hasNulls } +func sortDatumRowDatum(filterRow []interface{}, targetCol coldata.DatumVec) { + less := func(i, j int) bool { + var cmpResult int + + cmpResult = coldataext.CompareDatum(filterRow[i], targetCol, filterRow[j]) + + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpInDatum( targetElem interface{}, targetCol coldata.DatumVec, filterRow []interface{}, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRowDatum, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -2750,6 +3129,14 @@ func (si *selectInOpDatum) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRowDatum because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRowDatum(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -2818,6 +3205,14 @@ func (pi *projectInOpDatum) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.Datum() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRowDatum because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRowDatum(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() diff --git a/pkg/sql/colexec/select_in_test.go b/pkg/sql/colexec/select_in_test.go index 7535fdb1b9a6..d93d7d6eaacb 100644 --- a/pkg/sql/colexec/select_in_test.go +++ b/pkg/sql/colexec/select_in_test.go @@ -70,6 +70,14 @@ func TestSelectInInt64(t *testing.T) { hasNulls: true, negate: true, }, + { + desc: "In test with unordered filterRow", + inputTuples: colexectestutils.Tuples{{0}, {1}, {2}}, + outputTuples: colexectestutils.Tuples{{0}, {1}, {2}}, + filterRow: []int64{2, 0, 1}, + hasNulls: false, + negate: false, + }, } for _, c := range testCases { @@ -211,6 +219,12 @@ func TestProjectInInt64(t *testing.T) { outputTuples: colexectestutils.Tuples{{1, false}, {2, false}}, inClause: "IN (3)", }, + { + desc: "In test with unordered tuple", + inputTuples: colexectestutils.Tuples{{0}, {1}, {2}}, + outputTuples: colexectestutils.Tuples{{0, true}, {1, true}, {2, true}}, + inClause: "IN (2, 0, 1)", + }, } for _, c := range testCases { diff --git a/pkg/sql/colexec/select_in_tmpl.go b/pkg/sql/colexec/select_in_tmpl.go index 244070893ed8..3a29e375be8e 100644 --- a/pkg/sql/colexec/select_in_tmpl.go +++ b/pkg/sql/colexec/select_in_tmpl.go @@ -20,6 +20,8 @@ package colexec import ( + "sort" + "github.com/cockroachdb/apd/v2" "github.com/cockroachdb/cockroach/pkg/col/coldata" "github.com/cockroachdb/cockroach/pkg/col/coldataext" @@ -141,6 +143,7 @@ type selectInOp_TYPE struct { filterRow []_GOTYPE hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &selectInOp_TYPE{} @@ -153,6 +156,7 @@ type projectInOp_TYPE struct { filterRow []_GOTYPE hasNulls bool negate bool + sorted bool } var _ colexecop.Operator = &projectInOp_TYPE{} @@ -173,11 +177,22 @@ func fillDatumRow_TYPE(t *types.T, datumTuple *tree.DTuple) ([]_GOTYPE, bool) { return result, hasNulls } +func sortDatumRow_TYPE(filterRow []_GOTYPE, targetCol _GOTYPESLICE) { + less := func(i, j int) bool { + var cmpResult int + _COMPARE(cmpResult, filterRow[i], filterRow[j], targetCol, _) + return cmpResult < 0 + } + if !sort.SliceIsSorted(filterRow, less) { + sort.Slice(filterRow, less) + } +} + func cmpIn_TYPE( targetElem _GOTYPE, targetCol _GOTYPESLICE, filterRow []_GOTYPE, hasNulls bool, ) comparisonResult { - // Filter row input is already sorted due to normalization, so we can use a - // binary search right away. + // Filter row input was already sorted in sortDatumRow_TYPE, so we can + // perform a binary search. lo := 0 hi := len(filterRow) for lo < hi { @@ -212,6 +227,14 @@ func (si *selectInOp_TYPE) Next() coldata.Batch { var idx int n := batch.Length() + // Sort si.filterRow once. We perform the sort here instead of in + // fillDatumRow_TYPE because the compare overload requires the eval + // context of a coldata.DatumVec target column. + if !si.sorted { + sortDatumRow_TYPE(si.filterRow, col) + si.sorted = true + } + compVal := siTrue if si.negate { compVal = siFalse @@ -286,6 +309,14 @@ func (pi *projectInOp_TYPE) Next() coldata.Batch { vec := batch.ColVec(pi.colIdx) col := vec.TemplateType() + // Sort pi.filterRow once. We perform the sort here instead of in + // fillDatumRow_TYPE because the compare overload requires the eval context + // of a coldata.DatumVec target column. + if !pi.sorted { + sortDatumRow_TYPE(pi.filterRow, col) + pi.sorted = true + } + projVec := batch.ColVec(pi.outputIdx) projCol := projVec.Bool() projNulls := projVec.Nulls() diff --git a/pkg/sql/logictest/testdata/logic_test/vectorize b/pkg/sql/logictest/testdata/logic_test/vectorize index a1f5856c9570..ef6d84316a4d 100644 --- a/pkg/sql/logictest/testdata/logic_test/vectorize +++ b/pkg/sql/logictest/testdata/logic_test/vectorize @@ -1253,3 +1253,18 @@ query T SELECT c FROM t68040 WHERE c LIKE '%\\%' ---- string with \ backslash + +# Regression test for #68979. The IN operator should evaluate correctly when the +# tuple contents are not sorted by the optimizer. +statement ok +CREATE TABLE t68979 ( + a INT +) + +statement ok +INSERT INTO t68979 VALUES (0) + +query B +SELECT 'b' IN ('b', (SELECT NULL FROM t68979), 'a') FROM t68979 +---- +true