From 0f9140f597b6ad1c6bd89071436deaa98554284e Mon Sep 17 00:00:00 2001 From: Marcus Gartner Date: Mon, 26 Jul 2021 13:31:42 -0400 Subject: [PATCH] colfetcher: fix NULL checks during unique index decoding `colfetcher` must detect `NULL` values in unique secondary index keys on tables with multiple column families in order to determine whether consecutive KVs belongs to the same row or different rows. Previously, only the columns in the key that were needed by the query were decoded and checked for `NULL`. This caused incorrect query results when `NULL` column values were not detected because those columns were not needed. This commit fixes the issue by checking all columns for `NULL` when decoding unique secondary index keys on tables with multiple column families. Fixes #66706 Release note (bug fix): A bug has been fix that caused incorrect query results when querying tables with multiple column families and unique secondary indexes. The bug only occurred if 1) vectorized execution was enabled for the query, 2) the query scanned a unique secondary index that contained columns from more than one column family, and 3) rows fetched by the query contained NULL values for some of the indexed columns. This bug was present since version 20.1. --- pkg/sql/colencoding/key_encoding.go | 25 ++++++++++-- pkg/sql/colfetcher/cfetcher.go | 39 ++++++++++++++----- .../logictest/testdata/logic_test/vectorize | 25 ++++++++++++ 3 files changed, 75 insertions(+), 14 deletions(-) diff --git a/pkg/sql/colencoding/key_encoding.go b/pkg/sql/colencoding/key_encoding.go index a069d7756f1e..80e5f538bb6e 100644 --- a/pkg/sql/colencoding/key_encoding.go +++ b/pkg/sql/colencoding/key_encoding.go @@ -35,6 +35,12 @@ import ( // "seek prefix": the next key that might be part of the table being searched // for. The input key will also be mutated if matches is false. See the analog // in sqlbase/index_encoding.go. +// +// Sometimes it is necessary to determine if the value of a column is NULL even +// though the value itself is not needed. If checkAllColsForNull is true, then +// foundNull=true will be returned if any columns in the key are NULL, +// regardless of whether or not indexColIdx indicates that the column should be +// decoded. func DecodeIndexKeyToCols( da *rowenc.DatumAlloc, vecs []coldata.Vec, @@ -42,6 +48,7 @@ func DecodeIndexKeyToCols( desc catalog.TableDescriptor, index *descpb.IndexDescriptor, indexColIdx []int, + checkAllColsForNull bool, types []*types.T, colDirs []descpb.IndexDescriptor_Direction, key roachpb.Key, @@ -76,7 +83,7 @@ func DecodeIndexKeyToCols( // it is a interleaving ancestor. var isNull bool key, isNull, err = DecodeKeyValsToCols( - da, vecs, idx, indexColIdx[:length], types[:length], + da, vecs, idx, indexColIdx[:length], checkAllColsForNull, types[:length], colDirs[:length], nil /* unseen */, key, invertedColIdx, ) if err != nil { @@ -112,7 +119,7 @@ func DecodeIndexKeyToCols( var isNull bool key, isNull, err = DecodeKeyValsToCols( - da, vecs, idx, indexColIdx, types, colDirs, nil /* unseen */, key, invertedColIdx, + da, vecs, idx, indexColIdx, checkAllColsForNull, types, colDirs, nil /* unseen */, key, invertedColIdx, ) if err != nil { return nil, false, false, err @@ -140,18 +147,24 @@ func DecodeIndexKeyToCols( // have been observed during decoding. // See the analog in sqlbase/index_encoding.go. // DecodeKeyValsToCols additionally returns whether a NULL was encountered when decoding. +// +// Sometimes it is necessary to determine if the value of a column is NULL even +// though the value itself is not needed. If checkAllColsForNull is true, then +// foundNull=true will be returned if any columns in the key are NULL, +// regardless of whether or not indexColIdx indicates that the column should be +// decoded. func DecodeKeyValsToCols( da *rowenc.DatumAlloc, vecs []coldata.Vec, idx int, indexColIdx []int, + checkAllColsForNull bool, types []*types.T, directions []descpb.IndexDescriptor_Direction, unseen *util.FastIntSet, key []byte, invertedColIdx int, -) ([]byte, bool, error) { - foundNull := false +) (remainingKey []byte, foundNull bool, _ error) { for j := range types { enc := descpb.IndexDescriptor_ASC if directions != nil { @@ -160,6 +173,10 @@ func DecodeKeyValsToCols( var err error i := indexColIdx[j] if i == -1 { + if checkAllColsForNull { + isNull := encoding.PeekType(key) == encoding.Null + foundNull = foundNull || isNull + } // Don't need the coldata - skip it. key, err = rowenc.SkipTableKey(key) } else { diff --git a/pkg/sql/colfetcher/cfetcher.go b/pkg/sql/colfetcher/cfetcher.go index 7ad74f170f27..e2d96ba3c1e1 100644 --- a/pkg/sql/colfetcher/cfetcher.go +++ b/pkg/sql/colfetcher/cfetcher.go @@ -858,6 +858,11 @@ func (rf *cFetcher) nextBatch(ctx context.Context) (coldata.Batch, error) { if rf.traceKV { indexOrds = rf.table.allIndexColOrdinals } + // For unique secondary indexes on tables with multiple column + // families, we must check all columns for NULL values in order + // to determine whether a KV belongs to the same row as the + // previous KV or a different row. + checkAllColsForNull := rf.table.isSecondaryIndex && rf.table.index.Unique && rf.table.desc.NumFamilies() != 1 key, matches, foundNull, err = colencoding.DecodeIndexKeyToCols( &rf.table.da, rf.machine.colvecs, @@ -865,6 +870,7 @@ func (rf *cFetcher) nextBatch(ctx context.Context) (coldata.Batch, error) { rf.table.desc, rf.table.index, indexOrds, + checkAllColsForNull, rf.table.keyValTypes, rf.table.indexColumnDirs, rf.machine.nextKV.Key[rf.table.knownPrefixLength:], @@ -897,23 +903,31 @@ func (rf *cFetcher) nextBatch(ctx context.Context) (coldata.Batch, error) { rf.machine.lastRowPrefix = rf.machine.nextKV.Key[:prefixLen] } - // For unique secondary indexes, the index-key does not distinguish one row - // from the next if both rows contain identical values along with a NULL. + // For unique secondary indexes on tables with multiple column + // families, the index-key does not distinguish one row from the + // next if both rows contain identical values along with a NULL. // Consider the keys: // // /test/unique_idx/NULL/0 // /test/unique_idx/NULL/1 // - // The index-key extracted from the above keys is /test/unique_idx/NULL. The - // trailing /0 and /1 are the primary key used to unique-ify the keys when a - // NULL is present. When a null is present in the index key, we cut off more - // of the index key so that the prefix includes the primary key columns. + // The index-key extracted from the above keys is + // /test/unique_idx/NULL. The trailing /0 and /1 are the primary key + // used to unique-ify the keys when a NULL is present. When a null + // is present in the index key, we include the primary key columns + // in lastRowPrefix. // - // Note that we do not need to do this for non-unique secondary indexes because - // the extra columns in the primary key will _always_ be there, so we can decode - // them when processing the index. The difference with unique secondary indexes - // is that the extra columns are not always there, and are used to unique-ify + // Note that we do not need to do this for non-unique secondary + // indexes because the extra columns in the primary key will + // _always_ be there, so we can decode them when processing the + // index. The difference with unique secondary indexes is that the + // extra columns are not always there, and are used to unique-ify // the index key, rather than provide the primary key column values. + // + // We also do not need to do this when a table has only one column + // family because it is guaranteed that there is only one KV per + // row. We entirely skip the check that determines if the row is + // unfinished. if foundNull && rf.table.isSecondaryIndex && rf.table.index.Unique && rf.table.desc.NumFamilies() != 1 { // We get the remaining bytes after the computed prefix, and then // slice off the extra encoded columns from those bytes. We calculate @@ -950,11 +964,15 @@ func (rf *cFetcher) nextBatch(ctx context.Context) (coldata.Batch, error) { if rf.table.rowLastModified.Less(rf.machine.nextKV.Value.Timestamp) { rf.table.rowLastModified = rf.machine.nextKV.Value.Timestamp } + // If the table has only one column family, then the next KV will + // always belong to a different row than the current KV. if rf.table.desc.NumFamilies() == 1 { rf.machine.state[0] = stateFinalizeRow rf.machine.state[1] = stateInitFetch continue } + // If the table has more than one column family, then the next KV + // may belong to the same row as the current KV. rf.machine.state[0] = stateFetchNextKVWithUnfinishedRow case stateSeekPrefix: for { @@ -1231,6 +1249,7 @@ func (rf *cFetcher) processValue( rf.machine.colvecs, rf.machine.rowIdx, extraColOrds, + false, /* checkAllColsForNull */ table.extraTypes, nil, &rf.machine.remainingValueColsByIdx, diff --git a/pkg/sql/logictest/testdata/logic_test/vectorize b/pkg/sql/logictest/testdata/logic_test/vectorize index 926a0dcff789..48d5cb58b503 100644 --- a/pkg/sql/logictest/testdata/logic_test/vectorize +++ b/pkg/sql/logictest/testdata/logic_test/vectorize @@ -1170,3 +1170,28 @@ statement ok CREATE TYPE greeting AS ENUM ('hello'); CREATE TABLE greeting_table (x greeting); EXPLAIN (VEC) SELECT * FROM greeting_table; + +# Regression test for #66706. Ensure that cfetcher can correctly determine when +# consecutive KVs in a unique secondary index belong to the same row. +statement ok +SET vectorize=experimental_always + +statement ok +CREATE TABLE t66706 ( + a STRING, + b STRING, + UNIQUE INDEX u (b, a), + FAMILY (a), + FAMILY (b) +) + +statement ok +INSERT INTO t66706 VALUES + (NULL, 'bar'), + (NULL, 'bar') + +query T +SELECT b FROM t66706@u WHERE NOT (b = 'foo') +---- +bar +bar