From a16127ea22ae1438afdc55aa740f56695c52d5b0 Mon Sep 17 00:00:00 2001 From: Angela Xu Date: Fri, 12 Feb 2021 12:41:48 -0800 Subject: [PATCH] opt: inverted-index accelerate filters of the form j->'a' = '{"b": "c"}' Previously, the optimizer did not plan inverted index scans for queries with the JSON fetch value operator `->` when an object or array was on the right side of the equality operator `=`, only when it was a boolean, string, number, or null. This change allows the inverted index to be used in these types of queries. It supports objects with multiple key/value pairs, nested objects, arrays, arrays nested within objects, and objects nested within arrays. Fixes: #59605 Release note: None --- .../testdata/logic_test/inverted_index | 93 +++++- pkg/sql/opt/invertedidx/json_array.go | 21 +- pkg/sql/opt/memo/testdata/stats/inverted-json | 283 ++++++++++++++++++ 3 files changed, 388 insertions(+), 9 deletions(-) diff --git a/pkg/sql/logictest/testdata/logic_test/inverted_index b/pkg/sql/logictest/testdata/logic_test/inverted_index index 4ec455b113d6..a18cd46d3514 100644 --- a/pkg/sql/logictest/testdata/logic_test/inverted_index +++ b/pkg/sql/logictest/testdata/logic_test/inverted_index @@ -752,7 +752,22 @@ INSERT INTO f VALUES (10, '{"a": {"b": {"c": 1}}}'), (11, '{"a": {"b": {"c": 1, "d": 2}}}}'), (12, '{"a": {"b": {"d": 2}}}}'), - (13, '{"a": {"b": {"c": [1, 2]}}}') + (13, '{"a": {"b": {"c": [1, 2]}}}'), + (14, '{"a": {"b": {"c": [1, 2, 3]}}}'), + (15, '{"a": []}'), + (16, '{"a": {}}}'), + (17, '{"a": {"b": "c"}}'), + (18, '{"a": {"b": ["c", "d", "e"]}}'), + (19, '{"a": ["b", "c", "d", "e"]}'), + (20, '{"a": ["b", "e", "c", "d"]}'), + (21, '{"z": {"a": "b", "c": "d"}}'), + (22, '{"z": {"a": "b", "c": "d", "e": "f"}}'), + (23, '{"a": "b", "x": ["c", "d", "e"]}}'), + (24, '{"a": "b", "c": [{"d": 1}, {"e": 2}]}}'), + (25, '{"a": {"b": "c", "d": "e"}}'), + (26, '{"a": {"b": "c"}, "d": "e"}'), + (27, '[1, 2, {"b": "c"}]'), + (28, '[{"a": {"b": "c"}}, "d", "e"]') 
query T SELECT j FROM f@i WHERE j->'a' = '1' ORDER BY k @@ -789,6 +804,82 @@ SELECT j FROM f@i WHERE j->'a'->'b'->'c' = '1' ORDER BY k {"a": {"b": {"c": 1}}} {"a": {"b": {"c": 1, "d": 2}}} +query T +SELECT j FROM f@i WHERE j->'a' = '[]' ORDER BY k +---- +{"a": []} + +query T +SELECT j FROM f@i WHERE j->'a' = '{}' ORDER BY k +---- +{"a": {}} + +query T +SELECT j FROM f@i WHERE j->'a' = '["b"]' ORDER BY k +---- + +query T +SELECT j FROM f@i WHERE j->'a' = '"b"' ORDER BY k +---- +{"a": "b", "x": ["c", "d", "e"]} +{"a": "b", "c": [{"d": 1}, {"e": 2}]} + +query T +SELECT j FROM f@i WHERE j->'a' = '{"b": "c"}' ORDER BY k +---- +{"a": {"b": "c"}} +{"a": {"b": "c"}, "d": "e"} + +query T +SELECT j FROM f@i WHERE j->'a'->'b'->'c' = '[1, 2]' ORDER BY k +---- +{"a": {"b": {"c": [1, 2]}}} + +query T +SELECT j FROM f@i WHERE j->'z' = '{"a": "b", "c": "d"}' ORDER BY k +---- +{"z": {"a": "b", "c": "d"}} + +query T +SELECT j FROM f@i WHERE j->'a' = '["b", "c", "d", "e"]' ORDER BY k +---- +{"a": ["b", "c", "d", "e"]} + +query T +SELECT j FROM f@i WHERE j->'a' = '["b", "c", "d", "e"]' OR j->'a' = '["b", "e", "c", "d"]' ORDER BY k +---- +{"a": ["b", "c", "d", "e"]} +{"a": ["b", "e", "c", "d"]} + +query T +SELECT j FROM f@i WHERE j->'a' = '{"b": ["c", "d", "e"]}' ORDER BY k +---- +{"a": {"b": ["c", "d", "e"]}} + +query T +SELECT j FROM f@i WHERE j->'a'->'b' = '["c", "d", "e"]' ORDER BY k +---- +{"a": {"b": ["c", "d", "e"]}} + +query T +SELECT j FROM f@i WHERE j->'z'->'c' = '"d"' ORDER BY k +---- +{"z": {"a": "b", "c": "d"}} +{"z": {"a": "b", "c": "d", "e": "f"}} + +query T +SELECT j FROM f@i WHERE j->'z' = '{"c": "d"}' ORDER BY k +---- + +query T +SELECT j FROM f@i WHERE j->'a' = '"b"' AND j->'c' = '[{"d": 1}]' ORDER BY k +---- + +query T +SELECT j FROM f@i WHERE j->'a' = '"b"' AND j->'c' = '[{"d": 1}, {"e": 2}]' ORDER BY k +---- +{"a": "b", "c": [{"d": 1}, {"e": 2}]} + subtest arrays statement ok diff --git a/pkg/sql/opt/invertedidx/json_array.go 
b/pkg/sql/opt/invertedidx/json_array.go index 680da639cf31..a13313388588 100644 --- a/pkg/sql/opt/invertedidx/json_array.go +++ b/pkg/sql/opt/invertedidx/json_array.go @@ -342,12 +342,11 @@ func (j *jsonOrArrayFilterPlanner) extractJSONOrArrayContainsCondition( // expression in the form [col]->[index0]->[index1]->...->[indexN] where col is // a variable or expression referencing the inverted column in the inverted // index and each index is a constant string. The right expression must be a -// constant JSON value that is not an object or an array. +// constant JSON value. func (j *jsonOrArrayFilterPlanner) extractJSONFetchValEqCondition( evalCtx *tree.EvalContext, left *memo.FetchValExpr, right opt.ScalarExpr, ) inverted.Expression { - // The right side of the equals expression should be a constant JSON value - // that is not an object or array. + // The right side of the equals expression should be a constant JSON value. if !memo.CanExtractConstDatum(right) { return inverted.NonInvertedColExpression{} } @@ -355,10 +354,6 @@ func (j *jsonOrArrayFilterPlanner) extractJSONFetchValEqCondition( if !ok { return inverted.NonInvertedColExpression{} } - typ := val.JSON.Type() - if typ == json.ObjectJSONType || typ == json.ArrayJSONType { - return inverted.NonInvertedColExpression{} - } // Recursively traverse fetch val expressions and collect keys with which to // build the InvertedExpression. If it is not possible to build an inverted @@ -431,5 +426,15 @@ func (j *jsonOrArrayFilterPlanner) extractJSONFetchValEqCondition( obj = b.Build() } - return getInvertedExprForJSONOrArrayIndex(evalCtx, tree.NewDJSON(obj)) + invertedExpr := getInvertedExprForJSONOrArrayIndex(evalCtx, tree.NewDJSON(obj)) + + // When the right side is an array or object, the InvertedExpression + // generated is not tight. We must indicate it is non-tight so an additional + // filter is added. 
+ typ := val.JSON.Type() + if typ == json.ArrayJSONType || typ == json.ObjectJSONType { + invertedExpr.SetNotTight() + } + + return invertedExpr } diff --git a/pkg/sql/opt/memo/testdata/stats/inverted-json b/pkg/sql/opt/memo/testdata/stats/inverted-json index c1c6e08d5f5e..1cf26f1dc0dc 100644 --- a/pkg/sql/opt/memo/testdata/stats/inverted-json +++ b/pkg/sql/opt/memo/testdata/stats/inverted-json @@ -383,3 +383,286 @@ index-join t │ <--- '\x37000138' --- '\x37000139' ├── key: (1) └── fd: (1)-->(4) + +# A query with the fetch val operator with a single key/val pair object on the +# right side uses the inverted index, and the inverted expression is not tight. +opt +SELECT * FROM t WHERE j->'a' = '{"b": "c"}' +---- +select + ├── columns: k:1(int!null) j:2(jsonb) + ├── immutable + ├── stats: [rows=222.222222] + ├── key: (1) + ├── fd: (1)-->(2) + ├── index-join t + │ ├── columns: k:1(int!null) j:2(jsonb) + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ ├── fd: (1)-->(2) + │ └── scan t@j_idx + │ ├── columns: k:1(int!null) + │ ├── inverted constraint: /4/1 + │ │ └── spans: ["7a\x00\x02b\x00\x01\x12c\x00\x01", "7a\x00\x02b\x00\x01\x12c\x00\x01"] + │ ├── stats: [rows=2e-07, distinct(4)=2e-07, null(4)=0] + │ │ histogram(4)= + │ └── key: (1) + └── filters + └── (j:2->'a') = '{"b": "c"}' [type=bool, outer=(2), immutable] + +# A query with the fetch val operator with a nested object on the right side +# uses the inverted index, and the inverted expression is not tight. 
+opt +SELECT * FROM t WHERE j->'a' = '{"b": {"c": "d"}}' +---- +select + ├── columns: k:1(int!null) j:2(jsonb) + ├── immutable + ├── stats: [rows=222.222222] + ├── key: (1) + ├── fd: (1)-->(2) + ├── index-join t + │ ├── columns: k:1(int!null) j:2(jsonb) + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ ├── fd: (1)-->(2) + │ └── scan t@j_idx + │ ├── columns: k:1(int!null) + │ ├── inverted constraint: /4/1 + │ │ └── spans: ["7a\x00\x02b\x00\x02c\x00\x01\x12d\x00\x01", "7a\x00\x02b\x00\x02c\x00\x01\x12d\x00\x01"] + │ ├── stats: [rows=2e-07, distinct(4)=2e-07, null(4)=0] + │ │ histogram(4)= + │ └── key: (1) + └── filters + └── (j:2->'a') = '{"b": {"c": "d"}}' [type=bool, outer=(2), immutable] + +# A query with the fetch val operator with an object on the right side +# with multiple key/val pairs uses the inverted index, and the inverted +# expression is not tight. +opt +SELECT * FROM t WHERE j->'a' = '{"b": "c", "d": "e"}' +---- +select + ├── columns: k:1(int!null) j:2(jsonb) + ├── immutable + ├── stats: [rows=222.222222] + ├── key: (1) + ├── fd: (1)-->(2) + ├── index-join t + │ ├── columns: k:1(int!null) j:2(jsonb) + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ ├── fd: (1)-->(2) + │ └── inverted-filter + │ ├── columns: k:1(int!null) + │ ├── inverted expression: /4 + │ │ ├── tight: false, unique: true + │ │ ├── union spans: empty + │ │ └── INTERSECTION + │ │ ├── span expression + │ │ │ ├── tight: true, unique: true + │ │ │ └── union spans: ["7a\x00\x02b\x00\x01\x12c\x00\x01", "7a\x00\x02b\x00\x01\x12c\x00\x01"] + │ │ └── span expression + │ │ ├── tight: true, unique: true + │ │ └── union spans: ["7a\x00\x02d\x00\x01\x12e\x00\x01", "7a\x00\x02d\x00\x01\x12e\x00\x01"] + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ └── scan t@j_idx + │ ├── columns: k:1(int!null) j_inverted_key:4(jsonb!null) + │ ├── inverted constraint: /4/1 + │ │ └── spans + │ │ ├── ["7a\x00\x02b\x00\x01\x12c\x00\x01", "7a\x00\x02b\x00\x01\x12c\x00\x01"] + │ │ └── ["7a\x00\x02d\x00\x01\x12e\x00\x01", 
"7a\x00\x02d\x00\x01\x12e\x00\x01"] + │ ├── stats: [rows=2e-07, distinct(1)=2e-07, null(1)=0, distinct(4)=2e-07, null(4)=0] + │ │ histogram(4)= + │ ├── key: (1) + │ └── fd: (1)-->(4) + └── filters + └── (j:2->'a') = '{"b": "c", "d": "e"}' [type=bool, outer=(2), immutable] + +# A query with the fetch val operator with an array on the right side +# uses the inverted index, and the inverted expression is not tight. +opt +SELECT * FROM t WHERE j->'a' = '["b", "c", "d", "e"]' +---- +select + ├── columns: k:1(int!null) j:2(jsonb) + ├── immutable + ├── stats: [rows=222.222222] + ├── key: (1) + ├── fd: (1)-->(2) + ├── index-join t + │ ├── columns: k:1(int!null) j:2(jsonb) + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ ├── fd: (1)-->(2) + │ └── inverted-filter + │ ├── columns: k:1(int!null) + │ ├── inverted expression: /4 + │ │ ├── tight: false, unique: true + │ │ ├── union spans: empty + │ │ └── INTERSECTION + │ │ ├── span expression + │ │ │ ├── tight: true, unique: true + │ │ │ ├── union spans: empty + │ │ │ └── INTERSECTION + │ │ │ ├── span expression + │ │ │ │ ├── tight: true, unique: true + │ │ │ │ ├── union spans: empty + │ │ │ │ └── INTERSECTION + │ │ │ │ ├── span expression + │ │ │ │ │ ├── tight: true, unique: true + │ │ │ │ │ └── union spans: ["7a\x00\x02\x00\x03\x00\x01\x12b\x00\x01", "7a\x00\x02\x00\x03\x00\x01\x12b\x00\x01"] + │ │ │ │ └── span expression + │ │ │ │ ├── tight: true, unique: true + │ │ │ │ └── union spans: ["7a\x00\x02\x00\x03\x00\x01\x12c\x00\x01", "7a\x00\x02\x00\x03\x00\x01\x12c\x00\x01"] + │ │ │ └── span expression + │ │ │ ├── tight: true, unique: true + │ │ │ └── union spans: ["7a\x00\x02\x00\x03\x00\x01\x12d\x00\x01", "7a\x00\x02\x00\x03\x00\x01\x12d\x00\x01"] + │ │ └── span expression + │ │ ├── tight: true, unique: true + │ │ └── union spans: ["7a\x00\x02\x00\x03\x00\x01\x12e\x00\x01", "7a\x00\x02\x00\x03\x00\x01\x12e\x00\x01"] + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ └── scan t@j_idx + │ ├── columns: k:1(int!null) 
j_inverted_key:4(jsonb!null) + │ ├── inverted constraint: /4/1 + │ │ └── spans + │ │ ├── ["7a\x00\x02\x00\x03\x00\x01\x12b\x00\x01", "7a\x00\x02\x00\x03\x00\x01\x12b\x00\x01"] + │ │ ├── ["7a\x00\x02\x00\x03\x00\x01\x12c\x00\x01", "7a\x00\x02\x00\x03\x00\x01\x12c\x00\x01"] + │ │ ├── ["7a\x00\x02\x00\x03\x00\x01\x12d\x00\x01", "7a\x00\x02\x00\x03\x00\x01\x12d\x00\x01"] + │ │ └── ["7a\x00\x02\x00\x03\x00\x01\x12e\x00\x01", "7a\x00\x02\x00\x03\x00\x01\x12e\x00\x01"] + │ ├── stats: [rows=2e-07, distinct(1)=2e-07, null(1)=0, distinct(4)=2e-07, null(4)=0] + │ │ histogram(4)= + │ ├── key: (1) + │ └── fd: (1)-->(4) + └── filters + └── (j:2->'a') = '["b", "c", "d", "e"]' [type=bool, outer=(2), immutable] + +# A query with the fetch val operator with an object on the right side +# that contains an array uses the inverted index, and the inverted expression +# is not tight. +opt +SELECT * FROM t WHERE j->'a' = '{"b": ["c", "d", "e"]}' +---- +select + ├── columns: k:1(int!null) j:2(jsonb) + ├── immutable + ├── stats: [rows=222.222222] + ├── key: (1) + ├── fd: (1)-->(2) + ├── index-join t + │ ├── columns: k:1(int!null) j:2(jsonb) + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ ├── fd: (1)-->(2) + │ └── inverted-filter + │ ├── columns: k:1(int!null) + │ ├── inverted expression: /4 + │ │ ├── tight: false, unique: true + │ │ ├── union spans: empty + │ │ └── INTERSECTION + │ │ ├── span expression + │ │ │ ├── tight: true, unique: true + │ │ │ ├── union spans: empty + │ │ │ └── INTERSECTION + │ │ │ ├── span expression + │ │ │ │ ├── tight: true, unique: true + │ │ │ │ └── union spans: ["7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12c\x00\x01", "7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12c\x00\x01"] + │ │ │ └── span expression + │ │ │ ├── tight: true, unique: true + │ │ │ └── union spans: ["7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12d\x00\x01", "7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12d\x00\x01"] + │ │ └── span expression + │ │ ├── tight: true, unique: true + │ │ └── union spans: 
["7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12e\x00\x01", "7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12e\x00\x01"] + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ └── scan t@j_idx + │ ├── columns: k:1(int!null) j_inverted_key:4(jsonb!null) + │ ├── inverted constraint: /4/1 + │ │ └── spans + │ │ ├── ["7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12c\x00\x01", "7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12c\x00\x01"] + │ │ ├── ["7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12d\x00\x01", "7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12d\x00\x01"] + │ │ └── ["7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12e\x00\x01", "7a\x00\x02b\x00\x02\x00\x03\x00\x01\x12e\x00\x01"] + │ ├── stats: [rows=2e-07, distinct(1)=2e-07, null(1)=0, distinct(4)=2e-07, null(4)=0] + │ │ histogram(4)= + │ ├── key: (1) + │ └── fd: (1)-->(4) + └── filters + └── (j:2->'a') = '{"b": ["c", "d", "e"]}' [type=bool, outer=(2), immutable] + +# A query with the fetch val operator with an empty array on the right side +# uses the inverted index, and the inverted expression is not tight. 
+opt +SELECT * FROM t WHERE j->'a' = '[]' +---- +select + ├── columns: k:1(int!null) j:2(jsonb) + ├── immutable + ├── stats: [rows=222.222222] + ├── key: (1) + ├── fd: (1)-->(2) + ├── index-join t + │ ├── columns: k:1(int!null) j:2(jsonb) + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ ├── fd: (1)-->(2) + │ └── inverted-filter + │ ├── columns: k:1(int!null) + │ ├── inverted expression: /4 + │ │ ├── tight: false, unique: false + │ │ └── union spans + │ │ ├── ["7a\x00\x018", "7a\x00\x018"] + │ │ └── ["7a\x00\x02\x00\x03", "7a\x00\x02\x00\x03"] + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ └── scan t@j_idx + │ ├── columns: k:1(int!null) j_inverted_key:4(jsonb!null) + │ ├── inverted constraint: /4/1 + │ │ └── spans + │ │ ├── ["7a\x00\x018", "7a\x00\x018"] + │ │ └── ["7a\x00\x02\x00\x03", "7a\x00\x02\x00\x03"] + │ ├── stats: [rows=2e-07, distinct(1)=2e-07, null(1)=0, distinct(4)=2e-07, null(4)=0] + │ │ histogram(4)= + │ ├── key: (1) + │ └── fd: (1)-->(4) + └── filters + └── (j:2->'a') = '[]' [type=bool, outer=(2), immutable] + +# A query with the fetch val operator with an empty object on the right side +# uses the inverted index, and the inverted expression is not tight. 
+opt +SELECT * FROM t WHERE j->'a' = '{}' +---- +select + ├── columns: k:1(int!null) j:2(jsonb) + ├── immutable + ├── stats: [rows=222.222222] + ├── key: (1) + ├── fd: (1)-->(2) + ├── index-join t + │ ├── columns: k:1(int!null) j:2(jsonb) + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ ├── fd: (1)-->(2) + │ └── inverted-filter + │ ├── columns: k:1(int!null) + │ ├── inverted expression: /4 + │ │ ├── tight: false, unique: false + │ │ └── union spans + │ │ ├── ["7a\x00\x019", "7a\x00\x019"] + │ │ └── ["7a\x00\x02\x00\xff", "7a\x00\x03") + │ ├── stats: [rows=2e-07] + │ ├── key: (1) + │ └── scan t@j_idx + │ ├── columns: k:1(int!null) j_inverted_key:4(jsonb!null) + │ ├── inverted constraint: /4/1 + │ │ └── spans + │ │ ├── ["7a\x00\x019", "7a\x00\x019"] + │ │ └── ["7a\x00\x02\x00\xff", "7a\x00\x03") + │ ├── stats: [rows=2e-07, distinct(1)=2e-07, null(1)=0, distinct(4)=2e-07, null(4)=0] + │ │ histogram(4)= + │ ├── key: (1) + │ └── fd: (1)-->(4) + └── filters + └── (j:2->'a') = '{}' [type=bool, outer=(2), immutable]