From 53c860b7691ff9b7422d5055d2700a3e9b7ebd4b Mon Sep 17 00:00:00 2001 From: Tommy Reilly Date: Fri, 17 Dec 2021 15:29:53 +0000 Subject: [PATCH] opt: fix like escape processing for span constraints Fixes: #44123 Previously no attempt was made to properly handle escape ('\\') sequence in like patterns being turned into constraints. Refactor code used to process like at runtime to generate a regexp and use that to properly handle index constraint generation. Release note (sql change): Escape character processing was missing from constraint span generation which resulted in incorrect results when doing escaped like lookups. Release justification: visible logical error --- .../testdata/logic_test/select_index | 32 ++++++- .../opt/idxconstraint/index_constraints.go | 33 ++++--- pkg/sql/opt/idxconstraint/testdata/strings | 89 +++++++++++++++++++ pkg/sql/opt/memo/constraint_builder.go | 30 ++++--- .../opt/memo/testdata/logprops/constraints | 41 +++++++++ pkg/sql/sem/tree/eval.go | 25 ++++-- 6 files changed, 219 insertions(+), 31 deletions(-) diff --git a/pkg/sql/logictest/testdata/logic_test/select_index b/pkg/sql/logictest/testdata/logic_test/select_index index f909ab720a5a..d7e41430fd25 100644 --- a/pkg/sql/logictest/testdata/logic_test/select_index +++ b/pkg/sql/logictest/testdata/logic_test/select_index @@ -254,7 +254,7 @@ statement ok CREATE TABLE str (k INT PRIMARY KEY, v STRING, INDEX(v)) statement ok -INSERT INTO str VALUES (1, 'A'), (4, 'AB'), (2, 'ABC'), (5, 'ABCD'), (3, 'ABCDEZ'), (9, 'ABD') +INSERT INTO str VALUES (1, 'A'), (4, 'AB'), (2, 'ABC'), (5, 'ABCD'), (3, 'ABCDEZ'), (9, 'ABD'), (10, '\CBA'), (11, 'A%'), (12, 'CAB.*'), (13, 'CABD') query IT rowsort SELECT k, v FROM str WHERE v LIKE 'ABC%' @@ -263,11 +263,41 @@ SELECT k, v FROM str WHERE v LIKE 'ABC%' 5 ABCD 3 ABCDEZ +query IT rowsort +SELECT k, v FROM str WHERE v LIKE '\ABC%' +---- +2 ABC +5 ABCD +3 ABCDEZ + +statement error LIKE regexp compilation failed: LIKE pattern must not end with escape character +SELECT k, v FROM str WHERE v LIKE 'ABC\' + +query IT rowsort +SELECT k, v FROM str WHERE v LIKE '\\CBA%' +---- +10 \CBA + +query IT rowsort +SELECT k, v FROM str WHERE v LIKE 'A\%' +---- +11 A% + +query IT rowsort +SELECT k, v FROM str WHERE v LIKE 'CAB.*' +---- +12 CAB.* + query IT rowsort SELECT k, v FROM str WHERE v LIKE 'ABC%Z' ---- 3 ABCDEZ +query IT rowsort +SELECT k, v FROM str WHERE v LIKE '\ABCDE_' +---- +3 ABCDEZ + query IT rowsort SELECT k, v FROM str WHERE v SIMILAR TO 'ABC_*' ---- diff --git a/pkg/sql/opt/idxconstraint/index_constraints.go b/pkg/sql/opt/idxconstraint/index_constraints.go index dbc7577b2b4a..190eb0c0bfd1 100644 --- a/pkg/sql/opt/idxconstraint/index_constraints.go +++ b/pkg/sql/opt/idxconstraint/index_constraints.go @@ -280,21 +280,28 @@ func (c *indexConstraintCtx) makeSpansForSingleColumnDatum( case opt.LikeOp: if s, ok := tree.AsDString(datum); ok { - if i := strings.IndexAny(string(s), "_%"); i >= 0 { - if i == 0 { - // Mask starts with _ or %. - c.unconstrained(offset, out) - return false + // Normalize the like pattern to a regexp. + if pattern, err := tree.LikeEscape(string(s)); err == nil { + if re, err := regexp.Compile(pattern); err == nil { + prefix, complete := re.LiteralPrefix() + if complete { + c.eqSpan(offset, tree.NewDString(prefix), out) + return true + } + if prefix == "" { + // Mask starts with _ or %. + c.unconstrained(offset, out) + return false + } + c.makeStringPrefixSpan(offset, prefix, out) + // If pattern is simply prefix + .* the span is tight. Also pattern + // will have regexp special chars escaped and so prefix needs to be + // escaped too. + if prefixEscape, err := tree.LikeEscape(prefix); err == nil { + return strings.HasSuffix(pattern, ".*") && strings.TrimSuffix(pattern, ".*") == prefixEscape + } } - c.makeStringPrefixSpan(offset, string(s[:i]), out) - // A mask like ABC% is equivalent to restricting the prefix to ABC. - // A mask like ABC%Z requires restricting the prefix, but is a stronger - // condition. - return (i == len(s)-1) && s[i] == '%' } - // No wildcard characters, this is an equality. - c.eqSpan(offset, &s, out) - return true } case opt.SimilarToOp: diff --git a/pkg/sql/opt/idxconstraint/testdata/strings b/pkg/sql/opt/idxconstraint/testdata/strings index a7b7dd70a335..0ee81a97124e 100644 --- a/pkg/sql/opt/idxconstraint/testdata/strings +++ b/pkg/sql/opt/idxconstraint/testdata/strings @@ -1,3 +1,80 @@ +index-constraints vars=(a string) index=a +a LIKE 'ABC' +---- +[/'ABC' - /'ABC'] + +# A backslash that isn't escaping anything is just removed from pattern. +index-constraints vars=(a string) index=a +a LIKE '\aABC%' +---- +[/'aABC' - /'aABD') + +# A backslash that isn't escaping anything is just removed from pattern. +index-constraints vars=(a string) index=a +a LIKE 'A\BC%' +---- +[/'ABC' - /'ABD') + +# Currently we punt on custom ESCAPE clauses. +index-constraints vars=(a string) index=a +a LIKE '\aABC%' ESCAPE '|' +---- +[ - ] +Remaining filter: like_escape(a, e'\\aABC%', '|') + +# Single char wildcard requires remaining filter. +index-constraints vars=(a string) index=a +a LIKE '\aABC_' +---- +[/'aABC' - /'aABD') +Remaining filter: a LIKE e'\\aABC_' + +# Ending with wildcard with other wildcards present isn't tight. +index-constraints vars=(a string) index=a +a LIKE 'AB_C%' +---- +[/'AB' - /'AC') +Remaining filter: a LIKE 'AB_C%' + +# Ignore zero prefix (wildcard at beginning). +index-constraints vars=(a string) index=a +a LIKE '%ABC' +---- +(/NULL - ] +Remaining filter: a LIKE '%ABC' + +# Ignore zero prefix (wildcard at beginning). +index-constraints vars=(a string) index=a +a LIKE '_ABC' +---- +(/NULL - ] +Remaining filter: a LIKE '_ABC' + +# A backslash that is escaping a wildcard becomes equality. +index-constraints vars=(a string) index=a +a LIKE 'ABC\%' +---- +[/'ABC%' - /'ABC%'] + +# A backslash that is escaping a wildcard becomes equality. +index-constraints vars=(a string) index=a +a LIKE 'ABC\_' +---- +[/'ABC_' - /'ABC_'] + +# A backslash that is escaping a wildcard becomes equality. +index-constraints vars=(a string) index=a +a LIKE 'ABC\_Z' +---- +[/'ABC_Z' - /'ABC_Z'] + +# Invalid pattern does not generate index constraints. +index-constraints vars=(a string) index=a +a LIKE 'ABC\' +---- +(/NULL - ] +Remaining filter: a LIKE e'ABC\\' + index-constraints vars=(a string) index=a a LIKE 'ABC%' ---- @@ -60,6 +137,18 @@ a SIMILAR TO 'ABC.*Z' [/'ABC' - /'ABD') Remaining filter: a SIMILAR TO 'ABC.*Z' +index-constraints vars=(a string) index=(a) +a SIMILAR TO 'ABC%Z' +---- +[/'ABC' - /'ABD') +Remaining filter: a SIMILAR TO 'ABC%Z' + +index-constraints vars=(a string) index=(a) +a SIMILAR TO 'ABC_Z' +---- +[/'ABC' - /'ABD') +Remaining filter: a SIMILAR TO 'ABC_Z' + index-constraints vars=(a string) index=(a) a SIMILAR TO 'ABC' ---- diff --git a/pkg/sql/opt/memo/constraint_builder.go b/pkg/sql/opt/memo/constraint_builder.go index 64645e40cbac..da8bef95e9b4 100644 --- a/pkg/sql/opt/memo/constraint_builder.go +++ b/pkg/sql/opt/memo/constraint_builder.go @@ -176,20 +176,26 @@ func (cb *constraintsBuilder) buildSingleColumnConstraintConst( case opt.LikeOp: if s, ok := tree.AsDString(datum); ok { - if i := strings.IndexAny(string(s), "_%"); i >= 0 { - if i == 0 { - // Mask starts with _ or %. - return unconstrained, false + // Normalize the like pattern to a RE + if pattern, err := tree.LikeEscape(string(s)); err == nil { + if re, err := regexp.Compile(pattern); err == nil { + prefix, complete := re.LiteralPrefix() + if complete { + return cb.eqSpan(col, tree.NewDString(prefix)), true + } + if prefix == "" { + // Mask starts with _ or %. + return unconstrained, false + } + c := cb.makeStringPrefixSpan(col, prefix) + // If pattern is simply prefix + .* the span is tight. Also pattern + // will have regexp special chars escaped and so prefix needs to be + // escaped too. The original string may have superfluous escape + if prefixEscape, err := tree.LikeEscape(prefix); err == nil { + return c, strings.HasSuffix(pattern, ".*") && strings.TrimSuffix(pattern, ".*") == prefixEscape + } } - c := cb.makeStringPrefixSpan(col, string(s[:i])) - // A mask like ABC% is equivalent to restricting the prefix to ABC. - // A mask like ABC%Z requires restricting the prefix, but is a stronger - // condition. - tight := (i == len(s)-1) && s[i] == '%' - return c, tight } - // No wildcard characters, this is an equality. - return cb.eqSpan(col, &s), true } case opt.SimilarToOp: diff --git a/pkg/sql/opt/memo/testdata/logprops/constraints b/pkg/sql/opt/memo/testdata/logprops/constraints index 78b86b3c8523..00fab172965e 100644 --- a/pkg/sql/opt/memo/testdata/logprops/constraints +++ b/pkg/sql/opt/memo/testdata/logprops/constraints @@ -1053,6 +1053,47 @@ select ├── variable: v:3 [type=string] └── const: 'ABC%' [type=string] +opt +SELECT * FROM kuv WHERE v LIKE '\ABC%' +---- +select + ├── columns: k:1(int!null) u:2(float) v:3(string!null) + ├── key: (1) + ├── fd: (1)-->(2,3) + ├── prune: (1,2) + ├── interesting orderings: (+1) + ├── scan kuv + │ ├── columns: k:1(int!null) u:2(float) v:3(string) + │ ├── key: (1) + │ ├── fd: (1)-->(2,3) + │ ├── prune: (1-3) + │ └── interesting orderings: (+1) + └── filters + └── like [type=bool, outer=(3), constraints=(/3: [/'ABC' - /'ABD'); tight)] + ├── variable: v:3 [type=string] + └── const: e'\\ABC%' [type=string] + +# Like doesn't support RE syntax. +opt +SELECT * FROM kuv WHERE v LIKE 'ABC.*' +---- +select + ├── columns: k:1(int!null) u:2(float) v:3(string!null) + ├── key: (1) + ├── fd: ()-->(3), (1)-->(2) + ├── prune: (1,2) + ├── interesting orderings: (+1 opt(3)) + ├── scan kuv + │ ├── columns: k:1(int!null) u:2(float) v:3(string) + │ ├── key: (1) + │ ├── fd: (1)-->(2,3) + │ ├── prune: (1-3) + │ └── interesting orderings: (+1) + └── filters + └── like [type=bool, outer=(3), constraints=(/3: [/'ABC.*' - /'ABC.*']; tight), fd=()-->(3)] + ├── variable: v:3 [type=string] + └── const: 'ABC.*' [type=string] + opt SELECT * FROM kuv WHERE v LIKE 'ABC_' ---- diff --git a/pkg/sql/sem/tree/eval.go b/pkg/sql/sem/tree/eval.go index 04920c2ba98d..d5eadcd8da23 100644 --- a/pkg/sql/sem/tree/eval.go +++ b/pkg/sql/sem/tree/eval.go @@ -4928,6 +4928,13 @@ type likeKey struct { escape rune } +// LikeEscape converts a like pattern to a regexp pattern. +func LikeEscape(pattern string) (string, error) { + key := likeKey{s: pattern, caseInsensitive: false, escape: '\\'} + re, err := key.patternNoAnchor() + return re, err +} + // unescapePattern unescapes a pattern for a given escape token. // It handles escaped escape tokens properly by maintaining them as the escape // token in the return string. @@ -5287,11 +5294,7 @@ func calculateLengthAfterReplacingCustomEscape(s string, escape rune) (bool, int return changed, retLen, nil } -// Pattern implements the RegexpCacheKey interface. -// The strategy for handling custom escape character -// is to convert all unescaped escape character into '\'. -// k.escape can either be empty or a single character. -func (k likeKey) Pattern() (string, error) { +func (k likeKey) patternNoAnchor() (string, error) { // QuoteMeta escapes all regexp metacharacters (`\.+*?()|[]{}^$`) with a `\`. pattern := regexp.QuoteMeta(k.s) var err error @@ -5368,6 +5371,18 @@ func (k likeKey) Pattern() (string, error) { } } + return pattern, nil +} + +// Pattern implements the RegexpCacheKey interface. +// The strategy for handling custom escape character +// is to convert all unescaped escape character into '\'. +// k.escape can either be empty or a single character. +func (k likeKey) Pattern() (string, error) { + pattern, err := k.patternNoAnchor() + if err != nil { + return "", err + } return anchorPattern(pattern, k.caseInsensitive), nil }