opt: inverted-index accelerate filters of the form j->0 @> '{"b": "c"}'

Previously, the optimizer did not plan inverted index scans for filters having an integer as the index for the fetch value along with a containment operator. To address this, we now build JSON arrays from fetch value expressions with integer indexes. From these JSON arrays, inverted spans are built for constraining scans over inverted indexes. With these changes, chains of both integer and string fetch value operators are now supported alongside a containment operator (e.g., j->0 @> '{"b": "c"}'). Epic: CRDB-3301 Fixes: cockroachdb#94667 Release note (performance improvement): The optimizer now plans inverted index scans for queries that filter by JSON fetch value operators (->) with integer indices alongside containment operators, e.g., json_col->0 @> '{"b": "c"}'.
Shivs11 · Jan 30, 2023 · b3e0323 · b3e0323
1 parent 10ef5d9
commit b3e0323
Show file tree

Hide file tree

Showing 4 changed files with 465 additions and 49 deletions.
diff --git a/pkg/sql/logictest/testdata/logic_test/inverted_index b/pkg/sql/logictest/testdata/logic_test/inverted_index
@@ -806,7 +806,96 @@ INSERT INTO f VALUES
   (36, '[[]]'),
   (37, '[{"a": [0, "b"]}, null, 1]'),
   (38, '[[0, 1, 2], {"b": "c"}]'),
-  (39, '[[0, [1, 2]]]')
+  (39, '[[0, [1, 2]]]'),
+  (40, '[[0, 1, 2]]'),
+  (41, '[[{"a": {"b": []}}]]')
+
+query T
+SELECT j FROM f@i WHERE j->0 @> '[0, 1, 2, 3]'
+----
+
+query T
+SELECT j FROM f@i WHERE j->0 @> '[0]'
+----
+[[0, 1, 2], {"b": "c"}]
+[[0, [1, 2]]]
+[[0, 1, 2]]
+
+query T
+SELECT j FROM f@i WHERE j->0->1 @> '[1, 2, 3]'
+----
+
+query T
+SELECT j FROM f@i WHERE j->0->1 @> '[1, 2]'
+----
+[[0, [1, 2]]]
+
+query T
+SELECT j FROM f@i WHERE j->0 @> '{"a": {}}'
+----
+[{"a": {"b": "c"}}, "d", "e"]
+
+query T
+SELECT j FROM f@i WHERE j->0 @> '{"a": {"b": "c"}}'
+----
+[{"a": {"b": "c"}}, "d", "e"]
+
+query T
+SELECT j FROM f@i WHERE j->0->0 @> '{"a": {"b": []}}'
+----
+[[{"a": {"b": []}}]]
+
+query T
+SELECT j FROM f@i WHERE j->'a'->0 @> '1'
+----
+{"a": [1, 2]}
+{"a": [1, 2, null]}
+
+query T
+SELECT j FROM f@i WHERE j->0->'a' @> '{"b": "c"}'
+----
+[{"a": {"b": "c"}}, "d", "e"]
+
+query T
+SELECT j FROM f@i WHERE j->0 <@ '[1, 2, 3]'
+----
+[1, 2, {"b": "c"}]
+[[]]
+
+query T
+SELECT j FROM f@i WHERE j->1 <@ '[1, 2, 3]'
+----
+[1, 2, {"b": "c"}]
+
+query T
+SELECT j FROM f@i WHERE j->0->0 <@ '[1, 2, 3]'
+----
+
+query T
+SELECT j FROM f@i WHERE j->2 <@ '["d", "e"]'
+----
+[{"a": {"b": "c"}}, "d", "e"]
+
+query T
+SELECT j FROM f@i WHERE j->0 <@ '{"a": {"b": "c"}}'
+----
+[{"a": {"b": "c"}}, "d", "e"]
+
+query T
+SELECT j FROM f@i WHERE j->0 <@ '["a", "b"]'
+----
+["a"]
+[[]]
+
+query T
+SELECT j FROM f@i WHERE j->0 <@ '"a"'
+----
+["a"]
+
+query T
+SELECT j FROM f@i WHERE j->0 <@ '1'
+----
+[1, 2, {"b": "c"}]
 
 query T
 SELECT j FROM f@i WHERE j->0 = '[]' ORDER BY k
@@ -873,6 +962,7 @@ SELECT j FROM f@i WHERE j->0->0 = '0' ORDER BY k
 ----
 [[0, 1, 2], {"b": "c"}]
 [[0, [1, 2]]]
+[[0, 1, 2]]
 
 query T
 SELECT j FROM f@i WHERE j->0->1 = '[1, 2]' ORDER BY k

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/inverted_index b/pkg/sql/opt/exec/execbuilder/testdata/inverted_index
@@ -1484,26 +1484,27 @@ vectorized: true
               right columns: (a, b_inverted_key)
               right fixed values: 1 column
 
-# Inverted indices won't be used for queries of the form
+# Inverted indices will be used for queries of the form
 # b->0 @> '{"b": "c"}'
 query T
 EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 @> '{"b": "c"}'
 ----
 distribution: local
 vectorized: true
 ·
-• filter
+• index join
 │ columns: (b)
 │ estimated row count: 111 (missing stats)
-│ filter: (b->0) @> '{"b": "c"}'
+│ table: d@d_pkey
+│ key columns: a
 │
 └── • scan
-      columns: (b)
-      estimated row count: 1,000 (missing stats)
-      table: d@d_pkey
-      spans: FULL SCAN
+      columns: (a)
+      estimated row count: 111 (missing stats)
+      table: d@foo_inv
+      spans: /Arr/"b"/"c"-/Arr/"b"/"c"/PrefixEnd
 
-# Inverted indices won't be used for queries of the form
+# Inverted indices will be used for queries of the form
 # b->0 <@ '{"b": "c"}'
 query T
 EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 <@ '{"b": "c"}'
@@ -1516,11 +1517,178 @@ vectorized: true
 │ estimated row count: 333 (missing stats)
 │ filter: (b->0) <@ '{"b": "c"}'
 │
-└── • scan
-      columns: (b)
-      estimated row count: 1,000 (missing stats)
-      table: d@d_pkey
-      spans: FULL SCAN
+└── • index join
+    │ columns: (b)
+    │ estimated row count: 111 (missing stats)
+    │ table: d@d_pkey
+    │ key columns: a
+    │
+    └── • project
+        │ columns: (a)
+        │
+        └── • inverted filter
+            │ columns: (a, b_inverted_key)
+            │ estimated row count: 111 (missing stats)
+            │ inverted column: b_inverted_key
+            │ num spans: 3
+            │
+            └── • scan
+                  columns: (a, b_inverted_key)
+                  estimated row count: 111 (missing stats)
+                  table: d@foo_inv
+                  spans: /[]-/{} /Arr/{}-/Arr/{}/PrefixEnd /Arr/"b"/"c"-/Arr/"b"/"c"/PrefixEnd
+
+query T
+EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 <@ '1'
+----
+distribution: local
+vectorized: true
+·
+• filter
+│ columns: (b)
+│ estimated row count: 333 (missing stats)
+│ filter: (b->0) <@ '1'
+│
+└── • index join
+    │ columns: (b)
+    │ estimated row count: 111 (missing stats)
+    │ table: d@d_pkey
+    │ key columns: a
+    │
+    └── • project
+        │ columns: (a)
+        │
+        └── • inverted filter
+            │ columns: (a, b_inverted_key)
+            │ estimated row count: 111 (missing stats)
+            │ inverted column: b_inverted_key
+            │ num spans: 3
+            │
+            └── • scan
+                  columns: (a, b_inverted_key)
+                  estimated row count: 111 (missing stats)
+                  table: d@foo_inv
+                  spans: /1-/1/PrefixEnd /[]-/{} /Arr/1-/Arr/1/PrefixEnd
+
+query T
+EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 @> '1'
+----
+distribution: local
+vectorized: true
+·
+• index join
+│ columns: (b)
+│ estimated row count: 111 (missing stats)
+│ table: d@d_pkey
+│ key columns: a
+│
+└── • project
+    │ columns: (a)
+    │
+    └── • inverted filter
+        │ columns: (a, b_inverted_key)
+        │ estimated row count: 111 (missing stats)
+        │ inverted column: b_inverted_key
+        │ num spans: 2
+        │
+        └── • scan
+              columns: (a, b_inverted_key)
+              estimated row count: 111 (missing stats)
+              table: d@foo_inv
+              spans: /Arr/1-/Arr/1/PrefixEnd /Arr/Arr/1-/Arr/Arr/1/PrefixEnd
+
+query T
+EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 @> '[1, 2]'
+----
+distribution: local
+vectorized: true
+·
+• project
+│ columns: (b)
+│
+└── • lookup join (inner)
+    │ columns: (a, b)
+    │ estimated row count: 12 (missing stats)
+    │ table: d@d_pkey
+    │ equality: (a) = (a)
+    │ equality cols are key
+    │ pred: (b->0) @> '[1, 2]'
+    │
+    └── • project
+        │ columns: (a)
+        │
+        └── • zigzag join
+              columns: (a, b_inverted_key, a, b_inverted_key)
+              estimated row count: 12 (missing stats)
+              left table: d@foo_inv
+              left columns: (a, b_inverted_key)
+              left fixed values: 1 column
+              right table: d@foo_inv
+              right columns: (a, b_inverted_key)
+              right fixed values: 1 column
+
+query T
+EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 <@ '[1, 2]'
+----
+distribution: local
+vectorized: true
+·
+• filter
+│ columns: (b)
+│ estimated row count: 333 (missing stats)
+│ filter: (b->0) <@ '[1, 2]'
+│
+└── • index join
+    │ columns: (b)
+    │ estimated row count: 111 (missing stats)
+    │ table: d@d_pkey
+    │ key columns: a
+    │
+    └── • project
+        │ columns: (a)
+        │
+        └── • inverted filter
+            │ columns: (a, b_inverted_key)
+            │ estimated row count: 111 (missing stats)
+            │ inverted column: b_inverted_key
+            │ num spans: 8
+            │
+            └── • scan
+                  columns: (a, b_inverted_key)
+                  estimated row count: 111 (missing stats)
+                  table: d@foo_inv
+                  spans: /1-/1/PrefixEnd /2-/2/PrefixEnd /[]-/{} /Arr/1-/Arr/1/PrefixEnd /Arr/2-/Arr/2/PrefixEnd /Arr/[]-/Arr/{} /Arr/Arr/1-/Arr/Arr/1/PrefixEnd /Arr/Arr/2-/Arr/Arr/2/PrefixEnd
+
+query T
+EXPLAIN (VERBOSE) SELECT b FROM d WHERE (b->0 @> '[1, 2]') AND (b->1 <@ '[1]')
+----
+distribution: local
+vectorized: true
+·
+• project
+│ columns: (b)
+│
+└── • lookup join (inner)
+    │ columns: (a, b)
+    │ estimated row count: 4 (missing stats)
+    │ table: d@d_pkey
+    │ equality: (a) = (a)
+    │ equality cols are key
+    │ pred: ((b->0) @> '[1, 2]') AND ((b->1) <@ '[1]')
+    │
+    └── • project
+        │ columns: (a)
+        │
+        └── • zigzag join
+              columns: (a, b_inverted_key, a, b_inverted_key)
+              estimated row count: 12 (missing stats)
+              left table: d@foo_inv
+              left columns: (a, b_inverted_key)
+              left fixed values: 1 column
+              right table: d@foo_inv
+              right columns: (a, b_inverted_key)
+              right fixed values: 1 column
+
 
 # Stats reflect the following, with some histogram buckets removed:
 # insert into d select g, '[1,2]' from generate_series(1,1000) g(g);

diff --git a/pkg/sql/opt/invertedidx/json_array.go b/pkg/sql/opt/invertedidx/json_array.go
@@ -612,14 +612,6 @@ func (j *jsonOrArrayFilterPlanner) extractJSONFetchValContainsCondition(
 		return inverted.NonInvertedColExpression{}
 	}
 
-	// Not using inverted indices, yet, for filters of the form
-	// j->0 @> '{"b": "c"}' or j->0 <@ '{"b": "c"}'
-	for i := range keys {
-		if _, ok := keys[i].(*tree.DString); !ok {
-			return inverted.NonInvertedColExpression{}
-		}
-	}
-
 	// Build a new JSON object with the collected keys and val.
 	obj := buildObject(keys, val.JSON)