Skip to content

Commit

Permalink
Merge #79705
Browse files Browse the repository at this point in the history
79705: sql: implement trigram inverted indexes and index acceleration for % and LIKE r=jordanlewis a=jordanlewis

Updates #41285

See commits for details. This PR implements inverted indexing on string columns using trigrams and searching trigram indexes using the `=`, `LIKE`, `ILIKE` and `%` (text similarity) operators.

Inverted indexes on text columns are built with the `gin_trgm_ops` or `gist_trgm_ops` opclasses, with `CREATE INDEX ON t USING GIN(col gin_trgm_ops)`, `CREATE INVERTED INDEX ON t(col gin_trgm_ops)`, or `CREATE INDEX ON t USING GIST(col gist_trgm_ops)`. The `gin` and `gist` opclasses are currently implemented identically.


Co-authored-by: Jordan Lewis <[email protected]>
  • Loading branch information
craig[bot] and jordanlewis committed Jun 7, 2022
2 parents 729b075 + b9b8033 commit 5bd3fdd
Show file tree
Hide file tree
Showing 36 changed files with 1,196 additions and 82 deletions.
2 changes: 1 addition & 1 deletion docs/generated/settings/settings-for-tenants.txt
Original file line number Diff line number Diff line change
Expand Up @@ -282,4 +282,4 @@ trace.jaeger.agent string the address of a Jaeger agent to receive traces using
trace.opentelemetry.collector string address of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as <host>:<port>. If no port is specified, 4317 will be used.
trace.span_registry.enabled boolean true if set, ongoing traces can be seen at https://<ui>/#/debug/tracez
trace.zipkin.collector string the address of a Zipkin instance to receive traces, as <host>:<port>. If no port is specified, 9411 will be used.
version version 22.1-8 set the active cluster version in the format '<major>.<minor>'
version version 22.1-10 set the active cluster version in the format '<major>.<minor>'
2 changes: 1 addition & 1 deletion docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,6 @@
<tr><td><code>trace.opentelemetry.collector</code></td><td>string</td><td><code></code></td><td>address of an OpenTelemetry trace collector to receive traces using the otel gRPC protocol, as <host>:<port>. If no port is specified, 4317 will be used.</td></tr>
<tr><td><code>trace.span_registry.enabled</code></td><td>boolean</td><td><code>true</code></td><td>if set, ongoing traces can be seen at https://<ui>/#/debug/tracez</td></tr>
<tr><td><code>trace.zipkin.collector</code></td><td>string</td><td><code></code></td><td>the address of a Zipkin instance to receive traces, as <host>:<port>. If no port is specified, 9411 will be used.</td></tr>
<tr><td><code>version</code></td><td>version</td><td><code>22.1-8</code></td><td>set the active cluster version in the format '<major>.<minor>'</td></tr>
<tr><td><code>version</code></td><td>version</td><td><code>22.1-10</code></td><td>set the active cluster version in the format '<major>.<minor>'</td></tr>
</tbody>
</table>
8 changes: 8 additions & 0 deletions pkg/clusterversion/cockroach_versions.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,10 @@ const (
// keys at the Pebble layer.
EnablePebbleFormatVersionRangeKeys

// TrigramInvertedIndexes enables the creation of trigram inverted indexes
// on strings.
TrigramInvertedIndexes

// *************************************************
// Step (1): Add new versions here.
// Do not add new versions to a patch release.
Expand Down Expand Up @@ -638,6 +642,10 @@ var versionsSingleton = keyedVersions{
Key: EnablePebbleFormatVersionRangeKeys,
Version: roachpb.Version{Major: 22, Minor: 1, Internal: 8},
},
{
Key: TrigramInvertedIndexes,
Version: roachpb.Version{Major: 22, Minor: 1, Internal: 10},
},

// *************************************************
// Step (2): Add new versions here.
Expand Down
5 changes: 3 additions & 2 deletions pkg/clusterversion/key_string.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pkg/internal/sqlsmith/alter.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ func makeCreateIndex(s *Smither) (tree.Statement, bool) {
seen[col.Name] = true
// If this is the first column and it's invertible (i.e., JSONB), make an inverted index.
if len(cols) == 0 &&
colinfo.ColumnTypeIsInvertedIndexable(tree.MustBeStaticallyKnownType(col.Type)) {
colinfo.ColumnTypeIsOnlyInvertedIndexable(tree.MustBeStaticallyKnownType(col.Type)) {
inverted = true
unique = false
cols = append(cols, tree.IndexElem{
Expand Down
20 changes: 20 additions & 0 deletions pkg/sql/catalog/catpb/catalog.proto
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,23 @@ message AutoStatsSettings {
// FractionStaleRows is table setting sql_stats_automatic_collection_fraction_stale_rows.
optional double fraction_stale_rows = 3;
}

// InvertedIndexColumnKind is the kind of the inverted index on a column. The
// reason this needs to be stored is that we need to be able to check that the
// "opclass" passed into an inverted index declaration (for example,
// gin_trgm_ops) is compatible with the datatype of a particular column
// (gin_trgm_ops is only valid on text). A future reason is that it's possible
// to desire having more than one type of inverted index on a particular
// datatype - for example, you might want to create a "stemming" inverted index
// on text. And without this extra kind, it wouldn't be possible to distinguish
// a text inverted index that uses trigrams, vs a text inverted index that uses
// stemming.
enum InvertedIndexColumnKind {
// DEFAULT is the default kind of inverted index column. JSON, Array, and
// geo inverted indexes all are DEFAULT, though prior to 22.2 they had no
// kind at all.
DEFAULT = 0;
// TRIGRAM is the trigram kind of inverted index column. It's only valid on
// text columns.
TRIGRAM = 1;
}
24 changes: 20 additions & 4 deletions pkg/sql/catalog/colinfo/col_type_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,18 +123,34 @@ func ColumnTypeIsIndexable(t *types.T) bool {
}
// Some inverted index types also have a key encoding, but we don't
// want to support those yet. See #50659.
return !MustBeValueEncoded(t) && !ColumnTypeIsInvertedIndexable(t)
return !MustBeValueEncoded(t) && !ColumnTypeIsOnlyInvertedIndexable(t)
}

// ColumnTypeIsInvertedIndexable reports whether a column of type t may be the
// inverted column of an inverted index. Strings qualify, as does every type
// accepted by ColumnTypeIsOnlyInvertedIndexable.
func ColumnTypeIsInvertedIndexable(t *types.T) bool {
	// The string check comes first so that string types short-circuit before
	// the "only inverted indexable" check is consulted.
	return t.Family() == types.StringFamily || ColumnTypeIsOnlyInvertedIndexable(t)
}

// ColumnTypeIsOnlyInvertedIndexable returns true if the type t is only
// indexable via an inverted index.
//
// Ambiguous types and tuples are never reported as inverted indexable. Of the
// remaining types, only the JSON, array, geography and geometry families
// return true.
func ColumnTypeIsOnlyInvertedIndexable(t *types.T) bool {
	if t.IsAmbiguous() || t.Family() == types.TupleFamily {
		return false
	}
	// A single family check; the previous rendering carried both an early
	// boolean return and an equivalent switch, which left the switch
	// unreachable.
	switch t.Family() {
	case types.JsonFamily, types.ArrayFamily, types.GeographyFamily, types.GeometryFamily:
		return true
	default:
		return false
	}
}

// MustBeValueEncoded returns true if columns of the given kind can only be value
Expand Down
7 changes: 6 additions & 1 deletion pkg/sql/catalog/descpb/structured.proto
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,11 @@ message IndexDescriptor {
// columns which are explicitly part of the index (STORING clause).
repeated string store_column_names = 5;

// An ordered list of opclasses that parallels each of the inverted columns
// in the index. n.b.: currently, there can only be a single inverted column
// in an index, so this list will always be of size 0 or 1.
repeated cockroach.sql.catalog.catpb.InvertedIndexColumnKind inverted_column_kinds = 27;

// An ordered list of column IDs of which the index key is comprised. This
// list parallels the key_column_names list and does not include any
// additional stored columns. If the index is an inverted index, the last
Expand Down Expand Up @@ -490,7 +495,7 @@ message IndexDescriptor {
optional uint32 constraint_id = 26 [(gogoproto.customname) = "ConstraintID",
(gogoproto.casttype) = "ConstraintID", (gogoproto.nullable) = false];

// Next ID: 27
// Next ID: 28
}

// ConstraintToUpdate represents a constraint to be added to the table and
Expand Down
4 changes: 4 additions & 0 deletions pkg/sql/catalog/table_elements.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,10 @@ type Index interface {
// Panics if the index is not inverted.
InvertedColumnKeyType() *types.T

// InvertedColumnKind returns the kind of the inverted column of the inverted
// index.
InvertedColumnKind() catpb.InvertedIndexColumnKind

NumPrimaryStoredColumns() int
NumSecondaryStoredColumns() int
GetStoredColumnID(storedColumnOrdinal int) descpb.ColumnID
Expand Down
17 changes: 17 additions & 0 deletions pkg/sql/catalog/tabledesc/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/iterutil"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
)

var _ catalog.Index = (*index)(nil)
Expand Down Expand Up @@ -176,6 +177,22 @@ func (w index) InvertedColumnKeyType() *types.T {
return w.desc.InvertedColumnKeyType()
}

// InvertedColumnKind reports the kind recorded for the inverted column of
// this inverted index.
//
// Panics if the index is not inverted.
func (w index) InvertedColumnKind() catpb.InvertedIndexColumnKind {
	if w.desc.Type != descpb.IndexDescriptor_INVERTED {
		panic(errors.AssertionFailedf("index is not inverted"))
	}
	if kinds := w.desc.InvertedColumnKinds; len(kinds) > 0 {
		return kinds[0]
	}
	// No kinds were set prior to version 22.2, so an inverted index without
	// any recorded kind is reported as the default kind.
	return catpb.InvertedIndexColumnKind_DEFAULT
}

// CollectKeyColumnIDs creates a new set containing the column IDs in the key
// of this index.
func (w index) CollectKeyColumnIDs() catalog.TableColSet {
Expand Down
24 changes: 14 additions & 10 deletions pkg/sql/catalog/tabledesc/structured.go
Original file line number Diff line number Diff line change
Expand Up @@ -1085,16 +1085,7 @@ func checkColumnsValidForInvertedIndex(tableDesc *Mutable, indexColNames []strin
// The last column indexed by an inverted index must be
// inverted indexable.
if i == lastCol && !colinfo.ColumnTypeIsInvertedIndexable(col.GetType()) {
return errors.WithHint(
pgerror.Newf(
pgcode.FeatureNotSupported,
"column %s of type %s is not allowed as the last column in an inverted index",
col.GetName(),
col.GetType().Name(),
),
"see the documentation for more information about inverted indexes: "+docs.URL("inverted-indexes.html"),
)

return NewInvalidInvertedColumnError(col.GetName(), col.GetType().String())
}
// Any preceding columns must not be inverted indexable.
if i < lastCol && !colinfo.ColumnTypeIsIndexable(col.GetType()) {
Expand All @@ -1114,6 +1105,19 @@ func checkColumnsValidForInvertedIndex(tableDesc *Mutable, indexColNames []strin
return nil
}

// NewInvalidInvertedColumnError returns an error for a column that's not
// inverted indexable.
func NewInvalidInvertedColumnError(colName, colType string) error {
return errors.WithHint(
pgerror.Newf(
pgcode.FeatureNotSupported,
"column %s of type %s is not allowed as the last column in an inverted index",
colName, colType,
),
"see the documentation for more information about inverted indexes: "+docs.URL("inverted-indexes.html"),
)
}

// AddColumn adds a column to the table.
func (desc *Mutable) AddColumn(col *descpb.ColumnDescriptor) {
desc.Columns = append(desc.Columns, *col)
Expand Down
3 changes: 2 additions & 1 deletion pkg/sql/catalog/tabledesc/validate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ var validationMap = []struct {
"StoreColumnNames": {
status: todoIAmKnowinglyAddingTechDebt,
reason: "initial import: TODO(features): add validation"},
"KeyColumnIDs": {status: iSolemnlySwearThisFieldIsValidated},
"InvertedColumnKinds": {status: thisFieldReferencesNoObjects},
"KeyColumnIDs": {status: iSolemnlySwearThisFieldIsValidated},
"KeySuffixColumnIDs": {
status: todoIAmKnowinglyAddingTechDebt,
reason: "initial import: TODO(features): add validation"},
Expand Down
96 changes: 86 additions & 10 deletions pkg/sql/create_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@ import (
"context"
"time"

"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/docs"
"github.com/cockroachdb/cockroach/pkg/geo/geoindex"
"github.com/cockroachdb/cockroach/pkg/server/telemetry"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/catpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
Expand All @@ -33,6 +35,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/sqlerrors"
"github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented"
"github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
"github.com/cockroachdb/errors"
)
Expand Down Expand Up @@ -197,7 +200,9 @@ func makeIndexDescriptor(
CreatedAtNanos: params.EvalContext().GetTxnTimestamp(time.Microsecond).UnixNano(),
}

columnsToCheckForOpclass := columns
if n.Inverted {
columnsToCheckForOpclass = columns[:len(columns)-1]
if n.Sharded != nil {
return nil, pgerror.New(pgcode.InvalidSQLStatementName, "inverted indexes don't support hash sharding")
}
Expand All @@ -211,19 +216,21 @@ func makeIndexDescriptor(
}

indexDesc.Type = descpb.IndexDescriptor_INVERTED
column, err := tableDesc.FindColumnWithName(columns[len(columns)-1].Column)
invCol := columns[len(columns)-1]
column, err := tableDesc.FindColumnWithName(invCol.Column)
if err != nil {
return nil, err
}
switch column.GetType().Family() {
case types.GeometryFamily:
config, err := geoindex.GeometryIndexConfigForSRID(column.GetType().GeoSRIDOrZero())
if err != nil {
return nil, err
}
indexDesc.GeoConfig = *config
case types.GeographyFamily:
indexDesc.GeoConfig = *geoindex.DefaultGeographyIndexConfig()
if err := populateInvertedIndexDescriptor(
params.ctx, params.ExecCfg().Settings, column, &indexDesc, invCol); err != nil {
return nil, err
}
}

for i := range columnsToCheckForOpclass {
col := &columns[i]
if col.OpClass != "" {
return nil, newUndefinedOpclassError(col.OpClass)
}
}

Expand Down Expand Up @@ -303,6 +310,75 @@ func makeIndexDescriptor(
return &indexDesc, nil
}

// populateInvertedIndexDescriptor fills in the inverted-index-specific fields
// of indexDesc for an inverted index whose inverted column is column. invCol
// is the grammar node for that same column in the index creation statement and
// supplies the requested operator class.
//
// The descriptor's InvertedColumnKinds is always set to a single DEFAULT entry
// up front; it is switched to TRIGRAM only for string columns with a valid
// trigram opclass. Geometry and geography columns additionally get their
// GeoConfig populated. An error is returned when the opclass is not accepted
// for the column's type family, when the trigram cluster version is not yet
// active, or when the column's type family is not handled here at all.
func populateInvertedIndexDescriptor(
	ctx context.Context,
	cs *cluster.Settings,
	column catalog.Column,
	indexDesc *descpb.IndexDescriptor,
	invCol tree.IndexElem,
) error {
	indexDesc.InvertedColumnKinds = []catpb.InvertedIndexColumnKind{catpb.InvertedIndexColumnKind_DEFAULT}
	colType := column.GetType()
	opClass := invCol.OpClass

	switch colType.Family() {
	case types.ArrayFamily:
		if opClass != "array_ops" && opClass != "" {
			return newUndefinedOpclassError(opClass)
		}
		return nil

	case types.JsonFamily:
		switch opClass {
		case "jsonb_ops", "":
			return nil
		case "jsonb_path_ops":
			return unimplemented.NewWithIssue(81115, "operator class \"jsonb_path_ops\" is not supported")
		}
		return newUndefinedOpclassError(opClass)

	case types.GeometryFamily:
		// Geo indexes accept no explicit opclass.
		if opClass != "" {
			return newUndefinedOpclassError(opClass)
		}
		geoCfg, err := geoindex.GeometryIndexConfigForSRID(colType.GeoSRIDOrZero())
		if err != nil {
			return err
		}
		indexDesc.GeoConfig = *geoCfg
		return nil

	case types.GeographyFamily:
		if opClass != "" {
			return newUndefinedOpclassError(opClass)
		}
		indexDesc.GeoConfig = *geoindex.DefaultGeographyIndexConfig()
		return nil

	case types.StringFamily:
		// Strings have no default opclass: one of the trigram opclasses must
		// be named explicitly on the inverted column.
		if opClass == "" {
			return errors.WithHint(
				pgerror.New(pgcode.UndefinedObject, "data type text has no default operator class for access method \"gin\""),
				"You must specify an operator class for the index (did you mean gin_trgm_ops?)")
		}
		if opClass != "gin_trgm_ops" && opClass != "gist_trgm_ops" {
			return newUndefinedOpclassError(opClass)
		}
		// Trigram indexes are gated on the cluster version being active.
		if !cs.Version.IsActive(ctx, clusterversion.TrigramInvertedIndexes) {
			return pgerror.Newf(pgcode.FeatureNotSupported,
				"version %v must be finalized to create trigram inverted indexes",
				clusterversion.ByKey(clusterversion.TrigramInvertedIndexes))
		}
		indexDesc.InvertedColumnKinds[0] = catpb.InvertedIndexColumnKind_TRIGRAM
		return nil
	}

	return tabledesc.NewInvalidInvertedColumnError(column.GetName(), colType.Name())
}

// newUndefinedOpclassError builds the UndefinedObject error reported when an
// index column names an operator class that is not recognized.
func newUndefinedOpclassError(opclass tree.Name) error {
	const format = "operator class %q does not exist"
	return pgerror.Newf(pgcode.UndefinedObject, format, opclass)
}

// validateColumnsAreAccessible validates that the columns for an index are
// accessible. This check must be performed before creating inaccessible columns
// for expression indexes with replaceExpressionElemsWithVirtualCols.
Expand Down
6 changes: 3 additions & 3 deletions pkg/sql/create_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro
if err != nil {
return nil, err
}
isInvIndex := colinfo.ColumnTypeIsInvertedIndexable(col.GetType())
isInvIndex := colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType())
colStats = []jobspb.CreateStatsDetails_ColStat{{
ColumnIDs: columnIDs,
// By default, create histograms on all explicitly requested column stats
Expand Down Expand Up @@ -524,7 +524,7 @@ func createStatsDefaultColumns(
if err != nil {
return nil, err
}
isInverted := colinfo.ColumnTypeIsInvertedIndexable(col.GetType())
isInverted := colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType())
if err := addIndexColumnStatsIfNotExists(colID, isInverted); err != nil {
return nil, err
}
Expand Down Expand Up @@ -558,7 +558,7 @@ func createStatsDefaultColumns(
}
colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{
ColumnIDs: colList,
HasHistogram: !colinfo.ColumnTypeIsInvertedIndexable(col.GetType()),
HasHistogram: !colinfo.ColumnTypeIsOnlyInvertedIndexable(col.GetType()),
HistogramMaxBuckets: maxHistBuckets,
})
nonIdxCols++
Expand Down
Loading

0 comments on commit 5bd3fdd

Please sign in to comment.