Skip to content

Commit

Permalink
opt: inverted-index accelerate filters of the form j->0 @> '{"b": "c"}
Browse files Browse the repository at this point in the history
Previously, the optimizer did not plan inverted index scans for filters
having an integer as the index for the fetch value in a filter alongside
the "contains" or the "contained by" operator.

To address this, we now build JSON arrays from fetch value expressions
with integer indexes. From these JSON arrays, inverted spans are built
for constraining scans over inverted indexes. With these changes chains
of both integer and string fetch value operators are now supported
alongside the "contains" and the "contained by" operators.
(e.g., j->0 @> '{"b": "c"}' and j->0 <@ '{"b": "c"}').

Epic: CRDB-3301
Fixes: cockroachdb#94667

Release note (performance improvement): The optimizer now plans
inverted index scans for queries that filter by JSON fetch value
operators (->) with integer indices alongside the "contains" or
the "contained by" operators, e.g, json_col->0 @> '{"b": "c"}'
or json_col->0 <@ '{"b": "c"}'
  • Loading branch information
Shivs11 committed Jan 31, 2023
1 parent 10ef5d9 commit 6a3c3e3
Show file tree
Hide file tree
Showing 5 changed files with 621 additions and 46 deletions.
96 changes: 95 additions & 1 deletion pkg/sql/logictest/testdata/logic_test/inverted_index
Original file line number Diff line number Diff line change
Expand Up @@ -806,7 +806,100 @@ INSERT INTO f VALUES
(36, '[[]]'),
(37, '[{"a": [0, "b"]}, null, 1]'),
(38, '[[0, 1, 2], {"b": "c"}]'),
(39, '[[0, [1, 2]]]')
(39, '[[0, [1, 2]]]'),
(40, '[[0, 1, 2]]'),
(41, '[[{"a": {"b": []}}]]')

query T
SELECT j FROM f@i WHERE j->0 @> '[0, 1, 2, 3]'
----

query T
SELECT j FROM f@i WHERE j->0 @> '[0]'
----
[[0, 1, 2], {"b": "c"}]
[[0, [1, 2]]]
[[0, 1, 2]]

query T
SELECT j FROM f@i WHERE j->0->1 @> '[1, 2, 3]'
----

query T
SELECT j FROM f@i WHERE j->0->1 @> '[1, 2]'
----
[[0, [1, 2]]]

query T
SELECT j FROM f@i WHERE j->0 @> '{"a": {}}'
----
[{"a": {"b": "c"}}, "d", "e"]

query T
SELECT j FROM f@i WHERE j->0 @> '{"a": {"b": "c"}}'
----
[{"a": {"b": "c"}}, "d", "e"]

query T
SELECT j FROM f@i WHERE j->0->1 @> '{"a": {"b": []}}'
----

query T
SELECT j FROM f@i WHERE j->0->0 @> '{"a": {"b": []}}'
----
[[{"a": {"b": []}}]]

query T
SELECT j FROM f@i WHERE j->'a'->0 @> '1'
----
{"a": [1, 2]}
{"a": [1, 2, null]}

query T
SELECT j FROM f@i WHERE j->0->'a' @> '{"b": "c"}'
----
[{"a": {"b": "c"}}, "d", "e"]

query T
SELECT j FROM f@i WHERE j->0 <@ '[1, 2, 3]'
----
[1, 2, {"b": "c"}]
[[]]

query T
SELECT j FROM f@i WHERE j->1 <@ '[1, 2, 3]'
----
[1, 2, {"b": "c"}]

query T
SELECT j FROM f@i WHERE j->0->0 <@ '[1, 2, 3]'
----

query T
SELECT j FROM f@i WHERE j->2 <@ '["d", "e"]'
----
[{"a": {"b": "c"}}, "d", "e"]

query T
SELECT j FROM f@i WHERE j->0 <@ '{"a": {"b": "c"}}'
----
[{"a": {"b": "c"}}, "d", "e"]

query T
SELECT j FROM f@i WHERE j->0 <@ '["a", "b"]'
----
["a"]
[[]]

query T
SELECT j FROM f@i WHERE j->0 <@ '"a"'
----
["a"]

query T
SELECT j FROM f@i WHERE j->0 <@ '1'
----
[1, 2, {"b": "c"}]

query T
SELECT j FROM f@i WHERE j->0 = '[]' ORDER BY k
Expand Down Expand Up @@ -873,6 +966,7 @@ SELECT j FROM f@i WHERE j->0->0 = '0' ORDER BY k
----
[[0, 1, 2], {"b": "c"}]
[[0, [1, 2]]]
[[0, 1, 2]]

query T
SELECT j FROM f@i WHERE j->0->1 = '[1, 2]' ORDER BY k
Expand Down
194 changes: 181 additions & 13 deletions pkg/sql/opt/exec/execbuilder/testdata/inverted_index
Original file line number Diff line number Diff line change
Expand Up @@ -1484,26 +1484,27 @@ vectorized: true
right columns: (a, b_inverted_key)
right fixed values: 1 column

# Inverted indices won't be used for queries of the form
# Inverted indices will be used for queries of the form
# b->0 @> '{"b": "c"}'
query T
EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 @> '{"b": "c"}'
----
distribution: local
vectorized: true
·
filter
index join
│ columns: (b)
│ estimated row count: 111 (missing stats)
│ filter: (b->0) @> '{"b": "c"}'
│ table: d@d_pkey
│ key columns: a
└── • scan
columns: (b)
estimated row count: 1,000 (missing stats)
table: d@d_pkey
spans: FULL SCAN
columns: (a)
estimated row count: 111 (missing stats)
table: d@foo_inv
spans: /Arr/"b"/"c"-/Arr/"b"/"c"/PrefixEnd

# Inverted indices won't be used for queries of the form
# Inverted indices will be used for queries of the form
# b->0 <@ '{"b": "c"}'
query T
EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 <@ '{"b": "c"}'
Expand All @@ -1516,11 +1517,178 @@ vectorized: true
│ estimated row count: 333 (missing stats)
│ filter: (b->0) <@ '{"b": "c"}'
└── • scan
columns: (b)
estimated row count: 1,000 (missing stats)
table: d@d_pkey
spans: FULL SCAN
└── • index join
│ columns: (b)
│ estimated row count: 111 (missing stats)
│ table: d@d_pkey
│ key columns: a
└── • project
│ columns: (a)
└── • inverted filter
│ columns: (a, b_inverted_key)
│ estimated row count: 111 (missing stats)
│ inverted column: b_inverted_key
│ num spans: 3
└── • scan
columns: (a, b_inverted_key)
estimated row count: 111 (missing stats)
table: d@foo_inv
spans: /[]-/{} /Arr/{}-/Arr/{}/PrefixEnd /Arr/"b"/"c"-/Arr/"b"/"c"/PrefixEnd

query T
EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 <@ '1'
----
distribution: local
vectorized: true
·
• filter
│ columns: (b)
│ estimated row count: 333 (missing stats)
│ filter: (b->0) <@ '1'
└── • index join
│ columns: (b)
│ estimated row count: 111 (missing stats)
│ table: d@d_pkey
│ key columns: a
└── • project
│ columns: (a)
└── • inverted filter
│ columns: (a, b_inverted_key)
│ estimated row count: 111 (missing stats)
│ inverted column: b_inverted_key
│ num spans: 3
└── • scan
columns: (a, b_inverted_key)
estimated row count: 111 (missing stats)
table: d@foo_inv
spans: /1-/1/PrefixEnd /[]-/{} /Arr/1-/Arr/1/PrefixEnd

query T
EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 @> '1'
----
distribution: local
vectorized: true
·
• index join
│ columns: (b)
│ estimated row count: 111 (missing stats)
│ table: d@d_pkey
│ key columns: a
└── • project
│ columns: (a)
└── • inverted filter
│ columns: (a, b_inverted_key)
│ estimated row count: 111 (missing stats)
│ inverted column: b_inverted_key
│ num spans: 2
└── • scan
columns: (a, b_inverted_key)
estimated row count: 111 (missing stats)
table: d@foo_inv
spans: /Arr/1-/Arr/1/PrefixEnd /Arr/Arr/1-/Arr/Arr/1/PrefixEnd

query T
EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 @> '[1, 2]'
----
distribution: local
vectorized: true
·
• project
│ columns: (b)
└── • lookup join (inner)
│ columns: (a, b)
│ estimated row count: 12 (missing stats)
│ table: d@d_pkey
│ equality: (a) = (a)
│ equality cols are key
│ pred: (b->0) @> '[1, 2]'
└── • project
│ columns: (a)
└── • zigzag join
columns: (a, b_inverted_key, a, b_inverted_key)
estimated row count: 12 (missing stats)
left table: d@foo_inv
left columns: (a, b_inverted_key)
left fixed values: 1 column
right table: d@foo_inv
right columns: (a, b_inverted_key)
right fixed values: 1 column

query T
EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 <@ '[1, 2]'
----
distribution: local
vectorized: true
·
• filter
│ columns: (b)
│ estimated row count: 333 (missing stats)
│ filter: (b->0) <@ '[1, 2]'
└── • index join
│ columns: (b)
│ estimated row count: 111 (missing stats)
│ table: d@d_pkey
│ key columns: a
└── • project
│ columns: (a)
└── • inverted filter
│ columns: (a, b_inverted_key)
│ estimated row count: 111 (missing stats)
│ inverted column: b_inverted_key
│ num spans: 8
└── • scan
columns: (a, b_inverted_key)
estimated row count: 111 (missing stats)
table: d@foo_inv
spans: /1-/1/PrefixEnd /2-/2/PrefixEnd /[]-/{} /Arr/1-/Arr/1/PrefixEnd /Arr/2-/Arr/2/PrefixEnd /Arr/[]-/Arr/{} /Arr/Arr/1-/Arr/Arr/1/PrefixEnd /Arr/Arr/2-/Arr/Arr/2/PrefixEnd

query T
EXPLAIN (VERBOSE) SELECT b FROM d WHERE (b->0 @> '[1, 2]') AND (b->1 <@ '[1]')
----
distribution: local
vectorized: true
·
• project
│ columns: (b)
└── • lookup join (inner)
│ columns: (a, b)
│ estimated row count: 4 (missing stats)
│ table: d@d_pkey
│ equality: (a) = (a)
│ equality cols are key
│ pred: ((b->0) @> '[1, 2]') AND ((b->1) <@ '[1]')
└── • project
│ columns: (a)
└── • zigzag join
columns: (a, b_inverted_key, a, b_inverted_key)
estimated row count: 12 (missing stats)
left table: d@foo_inv
left columns: (a, b_inverted_key)
left fixed values: 1 column
right table: d@foo_inv
right columns: (a, b_inverted_key)
right fixed values: 1 column


# Stats reflect the following, with some histogram buckets removed:
# insert into d select g, '[1,2]' from generate_series(1,1000) g(g);
Expand Down
31 changes: 23 additions & 8 deletions pkg/sql/opt/invertedidx/json_array.go
Original file line number Diff line number Diff line change
Expand Up @@ -612,14 +612,6 @@ func (j *jsonOrArrayFilterPlanner) extractJSONFetchValContainsCondition(
return inverted.NonInvertedColExpression{}
}

// Not using inverted indices, yet, for filters of the form
// j->0 @> '{"b": "c"}' or j->0 <@ '{"b": "c"}'
for i := range keys {
if _, ok := keys[i].(*tree.DString); !ok {
return inverted.NonInvertedColExpression{}
}
}

// Build a new JSON object with the collected keys and val.
obj := buildObject(keys, val.JSON)

Expand Down Expand Up @@ -647,6 +639,29 @@ func (j *jsonOrArrayFilterPlanner) extractJSONFetchValContainsCondition(
invertedExpr = inverted.Or(invertedExpr, expr)
}
}

// If a key is of the type DInt, the InvertedExpression generated is
// not tight. This is because key encodings for JSON arrays don't
// contain the respective index positions for each of their elements.
// A JSON array of the form ["a", 31] will have the following encoding
// into an index key:
//
// 1/2/arr/a/pk1
// 1/2/arr/31/pk1
//
// where arr is an ARRAY type tag used to indicate that the next key is
// part of an array, 1 is the table id, 2 is the inverted index id and
// pk is a primary key of a row in the table. Since the array
// elements do not have their respective indices stored in
// the encoding, the original filter needs to be applied after the initial
// scan.
for i := range keys {
if _, ok := keys[i].(*tree.DInt); ok {
invertedExpr.SetNotTight()
break
}
}

return invertedExpr
}

Expand Down
Loading

0 comments on commit 6a3c3e3

Please sign in to comment.