Skip to content

Commit

Permalink
Merge #65437
Browse files Browse the repository at this point in the history
65437: opt: normalize CollateExpr Locale r=mgartner a=mgartner

#### opt: format CollateExpr.Locale in opt trees

Formatted opt trees now include the `Locale` field of `CollateExpr`s.
Previously, the `Locale` was never shown.

Release note: None

#### opt: normalize CollateExpr Locale

This commit normalizes the `Locale` string of a `CollateExpr` when the
expression is built in optbuilder. Normalization of this string ensures
that collated string expressions with different but equivalent locales
are considered equal. For example, the expressions `s COLLATE "en_us"`
and `s COLLATE "en-US"` are equivalent, but prior to this commit they
would be considered non-equivalent.

This change allows crucial optimizer rules to apply in more cases, like
`GenerateConstrainedScans`.

Consider the table:

    CREATE TABLE t (
      s STRING,
      c STRING COLLATE en_US AS (s COLLATE en_US) VIRTUAL,
      INDEX (c)
    )

Prior to this commit, none of the following queries would perform a
constrained scan on the secondary index because the collated expressions
on the left side of the `=` were not considered equal to the virtual
column expression.

    SELECT * FROM t WHERE s COLLATE "en_US" = 'foo' COLLATE en_US
    SELECT * FROM t WHERE s COLLATE "en-US" = 'foo' COLLATE en_US
    SELECT * FROM t WHERE s COLLATE "en-us" = 'foo' COLLATE en_US

The locale is normalized in optbuilder rather than in a normalization rule
for the sake of efficiency. A normalization rule would have to check
that a locale is not already normalized to prevent an infinite
normalization loop. This would require normalizing the locale multiple
times: at least once to normalize and at least once more in the
recursive call to `ConstructCollate` to detect if the locale was already
normalized. Normalizing the locale in optbuilder requires only
normalizing the locale once.

Fixes #65343

Release note (performance improvement): The optimizer now generates
query plans that scan indexes on virtual collated string columns,
regardless of the casing or formatting of the collated locale in the
query.


Co-authored-by: Marcus Gartner <[email protected]>
  • Loading branch information
craig[bot] and mgartner committed May 20, 2021
2 parents 7e2aa98 + 4b1a92a commit 1bb90a3
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 4 deletions.
9 changes: 9 additions & 0 deletions pkg/sql/lex/encode.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,15 @@ func EncodeSQLStringWithFlags(buf *bytes.Buffer, in string, flags lexbase.Encode
}
}

// NormalizeLocaleName produces the normalized form of the locale identifier s.
// The work is delegated to EncodeLocaleName: the casing of the locale is
// normalized and dash characters are replaced with underscore characters.
func NormalizeLocaleName(s string) string {
	// Pre-size the buffer to len(s) bytes before encoding into it.
	var normalized bytes.Buffer
	normalized.Grow(len(s))
	EncodeLocaleName(&normalized, s)
	return normalized.String()
}

// EncodeLocaleName writes the locale identifier in s to buf. Any dash
// characters are mapped to underscore characters. Underscore characters do not
// need to be quoted, and they are considered equivalent to dash characters by
Expand Down
76 changes: 76 additions & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/virtual_columns
Original file line number Diff line number Diff line change
Expand Up @@ -1443,3 +1443,79 @@ vectorized: true
estimated row count: 3 (missing stats)
table: inv@iv_jv_idx
spans: /10/"a"/"b"-/10/"a"/"b"/PrefixEnd /20/"a"/"b"-/20/"a"/"b"/PrefixEnd /30/"a"/"b"-/30/"a"/"b"/PrefixEnd

# Regression tests for #65343. Collated string locales should be normalized so
# that indexes on collated string virtual columns are scanned for any given
# locale format in a query.
subtest 65343

statement ok
CREATE TABLE t65343 (
s STRING,
c STRING COLLATE en_US AS (s COLLATE en_US) VIRTUAL,
INDEX (c)
)

query T
EXPLAIN SELECT * FROM t65343 WHERE s COLLATE en_US = 'foo' COLLATE en_US
----
distribution: local
vectorized: true
·
• render
└── • index join
│ table: t65343@primary
└── • scan
missing stats
table: t65343@t65343_c_idx
spans: [/'foo' COLLATE en_US - /'foo' COLLATE en_US]

query T
EXPLAIN SELECT * FROM t65343 WHERE s COLLATE "en_US" = 'foo' COLLATE en_US
----
distribution: local
vectorized: true
·
• render
└── • index join
│ table: t65343@primary
└── • scan
missing stats
table: t65343@t65343_c_idx
spans: [/'foo' COLLATE en_US - /'foo' COLLATE en_US]

query T
EXPLAIN SELECT * FROM t65343 WHERE s COLLATE "en_us" = 'foo' COLLATE en_US
----
distribution: local
vectorized: true
·
• render
└── • index join
│ table: t65343@primary
└── • scan
missing stats
table: t65343@t65343_c_idx
spans: [/'foo' COLLATE en_US - /'foo' COLLATE en_US]

query T
EXPLAIN SELECT * FROM t65343 WHERE s COLLATE "en-US" = 'foo' COLLATE en_US
----
distribution: local
vectorized: true
·
• render
└── • index join
│ table: t65343@primary
└── • scan
missing stats
table: t65343@t65343_c_idx
spans: [/'foo' COLLATE en_US - /'foo' COLLATE en_US]
5 changes: 4 additions & 1 deletion pkg/sql/opt/memo/expr_format.go
Original file line number Diff line number Diff line change
Expand Up @@ -1008,10 +1008,13 @@ func (f *ExprFmtCtx) FormatScalarProps(scalar opt.ScalarExpr) {
func (f *ExprFmtCtx) formatScalarPrivate(scalar opt.ScalarExpr) {
var private interface{}
switch t := scalar.(type) {
case *NullExpr, *TupleExpr, *CollateExpr:
case *NullExpr, *TupleExpr:
// Private is redundant with logical type property.
private = nil

case *CollateExpr:
fmt.Fprintf(f.Buffer, " locale='%s'", t.Locale)

case *AnyExpr:
// We don't want to show the OriginalExpr; just show Cmp.
private = t.Cmp
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/opt/optbuilder/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ go_library(
"//pkg/sql/catalog/schemaexpr",
"//pkg/sql/catalog/typedesc",
"//pkg/sql/delegate",
"//pkg/sql/lex",
"//pkg/sql/opt",
"//pkg/sql/opt/cat",
"//pkg/sql/opt/memo",
Expand Down
3 changes: 2 additions & 1 deletion pkg/sql/opt/optbuilder/scalar.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"fmt"

"github.com/cockroachdb/cockroach/pkg/server/telemetry"
"github.com/cockroachdb/cockroach/pkg/sql/lex"
"github.com/cockroachdb/cockroach/pkg/sql/opt"
"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
"github.com/cockroachdb/cockroach/pkg/sql/opt/memo"
Expand Down Expand Up @@ -133,7 +134,7 @@ func (b *Builder) buildScalar(

case *tree.CollateExpr:
in := b.buildScalar(t.Expr.(tree.TypedExpr), inScope, nil, nil, colRefs)
out = b.factory.ConstructCollate(in, t.Locale)
out = b.factory.ConstructCollate(in, lex.NormalizeLocaleName(t.Locale))

case *tree.ArrayFlatten:
s := t.Subquery.(*subquery)
Expand Down
50 changes: 48 additions & 2 deletions pkg/sql/opt/optbuilder/testdata/scalar
Original file line number Diff line number Diff line change
Expand Up @@ -821,9 +821,32 @@ concat [type=jsonb]
build-scalar
'hello' COLLATE en
----
collate [type=collatedstring{en}]
collate locale='en' [type=collatedstring{en}]
└── const: 'hello' [type=string]

build-scalar
'hello' COLLATE en_US
----
collate locale='en_US' [type=collatedstring{en_US}]
└── const: 'hello' [type=string]

build-scalar
'hello' COLLATE "en_US"
----
collate locale='en_US' [type=collatedstring{en_US}]
└── const: 'hello' [type=string]

build-scalar
'hello' COLLATE "en-US"
----
collate locale='en_US' [type=collatedstring{en_US}]
└── const: 'hello' [type=string]

build-scalar
'hello' COLLATE "foo"
----
error: invalid locale foo: language: subtag "foo" is well-formed but unknown

build-scalar
random()
----
Expand Down Expand Up @@ -1153,9 +1176,32 @@ tuple [type=tuple{tuple{bool, bool, bool}, tuple{bool, bool, bool}, tuple{bool,
build-scalar vars=(a string)
a COLLATE en
----
collate [type=collatedstring{en}]
collate locale='en' [type=collatedstring{en}]
└── variable: a:1 [type=string]

build-scalar vars=(a string)
a COLLATE en_US
----
collate locale='en_US' [type=collatedstring{en_US}]
└── variable: a:1 [type=string]

build-scalar vars=(a string)
a COLLATE "en_US"
----
collate locale='en_US' [type=collatedstring{en_US}]
└── variable: a:1 [type=string]

build-scalar vars=(a string)
a COLLATE "en-US"
----
collate locale='en_US' [type=collatedstring{en_US}]
└── variable: a:1 [type=string]

build-scalar vars=(a string)
a COLLATE "foo"
----
error: invalid locale foo: language: subtag "foo" is well-formed but unknown

exec-ddl
CREATE TABLE u (x INT)
----
Expand Down

0 comments on commit 1bb90a3

Please sign in to comment.