opt: inverted-index accelerate filters of the form j->0 @> '{"b": "c"}

Previously, the optimizer did not plan inverted index scans for filters having an integer as the index for the fetch value in a filter alongside the "contains" or the "contained by" operator. To address this, we now build JSON arrays from fetch value expressions with integer indexes. From these JSON arrays, inverted spans are built for constraining scans over inverted indexes. With these changes chains of both integer and string fetch value operators are now supported alongside the "contains" and the "contained by" operators. (e.g., j->0 @> '{"b": "c"}' and j->0 <@ '{"b": "c"}'). Epic: CRDB-3301 Fixes: cockroachdb#94667 Release note (performance improvement): The optimizer now plans inverted index scans for queries that filter by JSON fetch value operators (->) with integer indices alongside the "contains" or the "contained by" operators, e.g, json_col->0 @> '{"b": "c"}' or json_col->0 <@ '{"b": "c"}'
Shivs11 · Jan 31, 2023 · 6a3c3e3 · 6a3c3e3
1 parent 10ef5d9
commit 6a3c3e3
Show file tree

Hide file tree

Showing 5 changed files with 621 additions and 46 deletions.
diff --git a/pkg/sql/logictest/testdata/logic_test/inverted_index b/pkg/sql/logictest/testdata/logic_test/inverted_index
@@ -806,7 +806,100 @@ INSERT INTO f VALUES
   (36, '[[]]'),
   (37, '[{"a": [0, "b"]}, null, 1]'),
   (38, '[[0, 1, 2], {"b": "c"}]'),
-  (39, '[[0, [1, 2]]]')
+  (39, '[[0, [1, 2]]]'),
+  (40, '[[0, 1, 2]]'),
+  (41, '[[{"a": {"b": []}}]]')
+
+query T
+SELECT j FROM f@i WHERE j->0 @> '[0, 1, 2, 3]'
+----
+
+query T
+SELECT j FROM f@i WHERE j->0 @> '[0]'
+----
+[[0, 1, 2], {"b": "c"}]
+[[0, [1, 2]]]
+[[0, 1, 2]]
+
+query T
+SELECT j FROM f@i WHERE j->0->1 @> '[1, 2, 3]'
+----
+
+query T
+SELECT j FROM f@i WHERE j->0->1 @> '[1, 2]'
+----
+[[0, [1, 2]]]
+
+query T
+SELECT j FROM f@i WHERE j->0 @> '{"a": {}}'
+----
+[{"a": {"b": "c"}}, "d", "e"]
+
+query T
+SELECT j FROM f@i WHERE j->0 @> '{"a": {"b": "c"}}'
+----
+[{"a": {"b": "c"}}, "d", "e"]
+
+query T
+SELECT j FROM f@i WHERE j->0->1 @> '{"a": {"b": []}}'
+----
+
+query T
+SELECT j FROM f@i WHERE j->0->0 @> '{"a": {"b": []}}'
+----
+[[{"a": {"b": []}}]]
+
+query T
+SELECT j FROM f@i WHERE j->'a'->0 @> '1'
+----
+{"a": [1, 2]}
+{"a": [1, 2, null]}
+
+query T
+SELECT j FROM f@i WHERE j->0->'a' @> '{"b": "c"}'
+----
+[{"a": {"b": "c"}}, "d", "e"]
+
+query T
+SELECT j FROM f@i WHERE j->0 <@ '[1, 2, 3]'
+----
+[1, 2, {"b": "c"}]
+[[]]
+
+query T
+SELECT j FROM f@i WHERE j->1 <@ '[1, 2, 3]'
+----
+[1, 2, {"b": "c"}]
+
+query T
+SELECT j FROM f@i WHERE j->0->0 <@ '[1, 2, 3]'
+----
+
+query T
+SELECT j FROM f@i WHERE j->2 <@ '["d", "e"]'
+----
+[{"a": {"b": "c"}}, "d", "e"]
+
+query T
+SELECT j FROM f@i WHERE j->0 <@ '{"a": {"b": "c"}}'
+----
+[{"a": {"b": "c"}}, "d", "e"]
+
+query T
+SELECT j FROM f@i WHERE j->0 <@ '["a", "b"]'
+----
+["a"]
+[[]]
+
+query T
+SELECT j FROM f@i WHERE j->0 <@ '"a"'
+----
+["a"]
+
+query T
+SELECT j FROM f@i WHERE j->0 <@ '1'
+----
+[1, 2, {"b": "c"}]
 
 query T
 SELECT j FROM f@i WHERE j->0 = '[]' ORDER BY k
@@ -873,6 +966,7 @@ SELECT j FROM f@i WHERE j->0->0 = '0' ORDER BY k
 ----
 [[0, 1, 2], {"b": "c"}]
 [[0, [1, 2]]]
+[[0, 1, 2]]
 
 query T
 SELECT j FROM f@i WHERE j->0->1 = '[1, 2]' ORDER BY k

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/inverted_index b/pkg/sql/opt/exec/execbuilder/testdata/inverted_index
@@ -1484,26 +1484,27 @@ vectorized: true
               right columns: (a, b_inverted_key)
               right fixed values: 1 column
 
-# Inverted indices won't be used for queries of the form
+# Inverted indices will be used for queries of the form
 # b->0 @> '{"b": "c"}'
 query T
 EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 @> '{"b": "c"}'
 ----
 distribution: local
 vectorized: true
 ·
-• filter
+• index join
 │ columns: (b)
 │ estimated row count: 111 (missing stats)
-│ filter: (b->0) @> '{"b": "c"}'
+│ table: d@d_pkey
+│ key columns: a
 │
 └── • scan
-      columns: (b)
-      estimated row count: 1,000 (missing stats)
-      table: d@d_pkey
-      spans: FULL SCAN
+      columns: (a)
+      estimated row count: 111 (missing stats)
+      table: d@foo_inv
+      spans: /Arr/"b"/"c"-/Arr/"b"/"c"/PrefixEnd
 
-# Inverted indices won't be used for queries of the form
+# Inverted indices will be used for queries of the form
 # b->0 <@ '{"b": "c"}'
 query T
 EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 <@ '{"b": "c"}'
@@ -1516,11 +1517,178 @@ vectorized: true
 │ estimated row count: 333 (missing stats)
 │ filter: (b->0) <@ '{"b": "c"}'
 │
-└── • scan
-      columns: (b)
-      estimated row count: 1,000 (missing stats)
-      table: d@d_pkey
-      spans: FULL SCAN
+└── • index join
+    │ columns: (b)
+    │ estimated row count: 111 (missing stats)
+    │ table: d@d_pkey
+    │ key columns: a
+    │
+    └── • project
+        │ columns: (a)
+        │
+        └── • inverted filter
+            │ columns: (a, b_inverted_key)
+            │ estimated row count: 111 (missing stats)
+            │ inverted column: b_inverted_key
+            │ num spans: 3
+            │
+            └── • scan
+                  columns: (a, b_inverted_key)
+                  estimated row count: 111 (missing stats)
+                  table: d@foo_inv
+                  spans: /[]-/{} /Arr/{}-/Arr/{}/PrefixEnd /Arr/"b"/"c"-/Arr/"b"/"c"/PrefixEnd
+
+query T
+EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 <@ '1'
+----
+distribution: local
+vectorized: true
+·
+• filter
+│ columns: (b)
+│ estimated row count: 333 (missing stats)
+│ filter: (b->0) <@ '1'
+│
+└── • index join
+    │ columns: (b)
+    │ estimated row count: 111 (missing stats)
+    │ table: d@d_pkey
+    │ key columns: a
+    │
+    └── • project
+        │ columns: (a)
+        │
+        └── • inverted filter
+            │ columns: (a, b_inverted_key)
+            │ estimated row count: 111 (missing stats)
+            │ inverted column: b_inverted_key
+            │ num spans: 3
+            │
+            └── • scan
+                  columns: (a, b_inverted_key)
+                  estimated row count: 111 (missing stats)
+                  table: d@foo_inv
+                  spans: /1-/1/PrefixEnd /[]-/{} /Arr/1-/Arr/1/PrefixEnd
+
+query T
+EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 @> '1'
+----
+distribution: local
+vectorized: true
+·
+• index join
+│ columns: (b)
+│ estimated row count: 111 (missing stats)
+│ table: d@d_pkey
+│ key columns: a
+│
+└── • project
+    │ columns: (a)
+    │
+    └── • inverted filter
+        │ columns: (a, b_inverted_key)
+        │ estimated row count: 111 (missing stats)
+        │ inverted column: b_inverted_key
+        │ num spans: 2
+        │
+        └── • scan
+              columns: (a, b_inverted_key)
+              estimated row count: 111 (missing stats)
+              table: d@foo_inv
+              spans: /Arr/1-/Arr/1/PrefixEnd /Arr/Arr/1-/Arr/Arr/1/PrefixEnd
+
+query T
+EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 @> '[1, 2]'
+----
+distribution: local
+vectorized: true
+·
+• project
+│ columns: (b)
+│
+└── • lookup join (inner)
+    │ columns: (a, b)
+    │ estimated row count: 12 (missing stats)
+    │ table: d@d_pkey
+    │ equality: (a) = (a)
+    │ equality cols are key
+    │ pred: (b->0) @> '[1, 2]'
+    │
+    └── • project
+        │ columns: (a)
+        │
+        └── • zigzag join
+              columns: (a, b_inverted_key, a, b_inverted_key)
+              estimated row count: 12 (missing stats)
+              left table: d@foo_inv
+              left columns: (a, b_inverted_key)
+              left fixed values: 1 column
+              right table: d@foo_inv
+              right columns: (a, b_inverted_key)
+              right fixed values: 1 column
+
+query T
+EXPLAIN (VERBOSE) SELECT b FROM d WHERE b->0 <@ '[1, 2]'
+----
+distribution: local
+vectorized: true
+·
+• filter
+│ columns: (b)
+│ estimated row count: 333 (missing stats)
+│ filter: (b->0) <@ '[1, 2]'
+│
+└── • index join
+    │ columns: (b)
+    │ estimated row count: 111 (missing stats)
+    │ table: d@d_pkey
+    │ key columns: a
+    │
+    └── • project
+        │ columns: (a)
+        │
+        └── • inverted filter
+            │ columns: (a, b_inverted_key)
+            │ estimated row count: 111 (missing stats)
+            │ inverted column: b_inverted_key
+            │ num spans: 8
+            │
+            └── • scan
+                  columns: (a, b_inverted_key)
+                  estimated row count: 111 (missing stats)
+                  table: d@foo_inv
+                  spans: /1-/1/PrefixEnd /2-/2/PrefixEnd /[]-/{} /Arr/1-/Arr/1/PrefixEnd /Arr/2-/Arr/2/PrefixEnd /Arr/[]-/Arr/{} /Arr/Arr/1-/Arr/Arr/1/PrefixEnd /Arr/Arr/2-/Arr/Arr/2/PrefixEnd
+
+query T
+EXPLAIN (VERBOSE) SELECT b FROM d WHERE (b->0 @> '[1, 2]') AND (b->1 <@ '[1]')
+----
+distribution: local
+vectorized: true
+·
+• project
+│ columns: (b)
+│
+└── • lookup join (inner)
+    │ columns: (a, b)
+    │ estimated row count: 4 (missing stats)
+    │ table: d@d_pkey
+    │ equality: (a) = (a)
+    │ equality cols are key
+    │ pred: ((b->0) @> '[1, 2]') AND ((b->1) <@ '[1]')
+    │
+    └── • project
+        │ columns: (a)
+        │
+        └── • zigzag join
+              columns: (a, b_inverted_key, a, b_inverted_key)
+              estimated row count: 12 (missing stats)
+              left table: d@foo_inv
+              left columns: (a, b_inverted_key)
+              left fixed values: 1 column
+              right table: d@foo_inv
+              right columns: (a, b_inverted_key)
+              right fixed values: 1 column
+
 
 # Stats reflect the following, with some histogram buckets removed:
 # insert into d select g, '[1,2]' from generate_series(1,1000) g(g);

diff --git a/pkg/sql/opt/invertedidx/json_array.go b/pkg/sql/opt/invertedidx/json_array.go
@@ -612,14 +612,6 @@ func (j *jsonOrArrayFilterPlanner) extractJSONFetchValContainsCondition(
 		return inverted.NonInvertedColExpression{}
 	}
 
-	// Not using inverted indices, yet, for filters of the form
-	// j->0 @> '{"b": "c"}' or j->0 <@ '{"b": "c"}'
-	for i := range keys {
-		if _, ok := keys[i].(*tree.DString); !ok {
-			return inverted.NonInvertedColExpression{}
-		}
-	}
-
 	// Build a new JSON object with the collected keys and val.
 	obj := buildObject(keys, val.JSON)
 
@@ -647,6 +639,29 @@ func (j *jsonOrArrayFilterPlanner) extractJSONFetchValContainsCondition(
 			invertedExpr = inverted.Or(invertedExpr, expr)
 		}
 	}
+
+	// If a key is of the type DInt, the InvertedExpression generated is
+	// not tight. This is because key encodings for JSON arrays don't
+	// contain the respective index positions for each of their elements.
+	// A JSON array of the form ["a", 31] will have the following encoding
+	// into an index key:
+	//
+	// 1/2/arr/a/pk1
+	// 1/2/arr/31/pk1
+	//
+	// where arr is an ARRAY type tag used to indicate that the next key is
+	// part of an array, 1 is the table id, 2 is the inverted index id and
+	// pk is a primary key of a row in the table. Since the array
+	// elements do not have their respective indices stored in
+	// the encoding, the original filter needs to be applied after the initial
+	// scan.
+	for i := range keys {
+		if _, ok := keys[i].(*tree.DInt); ok {
+			invertedExpr.SetNotTight()
+			break
+		}
+	}
+
 	return invertedExpr
 }