From c378ac44c26267ec5907c1aa80f1683e4a902176 Mon Sep 17 00:00:00 2001 From: Eric Harmeling Date: Tue, 1 Sep 2020 14:34:47 -0400 Subject: [PATCH] Partial indexes --- _includes/sidebar-data-v20.2.json | 6 + .../v20.2/sql/diagrams/create_index.html | 167 ++++---- _includes/v20.2/sql/diagrams/index_def.html | 58 +-- v20.2/create-index.md | 3 +- v20.2/create-table.md | 1 + v20.2/indexes.md | 2 + v20.2/partial-indexes.md | 391 ++++++++++++++++++ v20.2/sql-feature-support.md | 6 +- 8 files changed, 524 insertions(+), 110 deletions(-) create mode 100644 v20.2/partial-indexes.md diff --git a/_includes/sidebar-data-v20.2.json b/_includes/sidebar-data-v20.2.json index 20d86bb61b2..8b55b6c5abd 100644 --- a/_includes/sidebar-data-v20.2.json +++ b/_includes/sidebar-data-v20.2.json @@ -1949,6 +1949,12 @@ "/${VERSION}/inverted-indexes.html" ] }, + { + "title": "Partial Indexes", + "urls": [ + "/${VERSION}/partial-indexes.html" + ] + }, { "title": "Column Families", "urls": [ diff --git a/_includes/v20.2/sql/diagrams/create_index.html b/_includes/v20.2/sql/diagrams/create_index.html index efef77b3721..be8cf45d742 100644 --- a/_includes/v20.2/sql/diagrams/create_index.html +++ b/_includes/v20.2/sql/diagrams/create_index.html @@ -1,91 +1,91 @@ -
- - - - -CREATE - - -UNIQUE - - -INDEX - - -CONCURRENTLY +
+ + + + +CREATE + + +UNIQUE + + +INDEX + + +CONCURRENTLY - - -opt_index_name + + +opt_index_name - - -IF - - -NOT - - -EXISTS + + +IF + + +NOT + + +EXISTS - - -index_name + + +index_name - - -ON + + +ON - - -table_name + + +table_name - - -USING + + +USING - - -name + + +name - - -( + + +( - - -a_expr + + +a_expr - - -ASC - - -DESC - - -, - - -) - - -USING - - -HASH - - -WITH - - -BUCKET_COUNT - - -= - - -n_buckets + + +ASC + + +DESC + + +, + + +) + + +USING + + +HASH + + +WITH + + +BUCKET_COUNT + + += + + +n_buckets COVERING @@ -116,7 +116,12 @@ opt_partition_by - - - + + + +opt_where_clause + + + +
diff --git a/_includes/v20.2/sql/diagrams/index_def.html b/_includes/v20.2/sql/diagrams/index_def.html index 52e04841c00..ccb29849b31 100644 --- a/_includes/v20.2/sql/diagrams/index_def.html +++ b/_includes/v20.2/sql/diagrams/index_def.html @@ -1,4 +1,4 @@ -
+
@@ -50,6 +50,9 @@ STORING + + +INCLUDE ( @@ -71,32 +74,37 @@ opt_partition_by - - -INVERTED - - -INDEX + + +INVERTED + + +INDEX - - -name + + +name - - -( + + +( - - -index_elem + + +index_elem - - -, - - -) - - - + + +, + + +) + + + +opt_where_clause + + + +
diff --git a/v20.2/create-index.md b/v20.2/create-index.md index 0d8136293d8..e82fa50db1b 100644 --- a/v20.2/create-index.md +++ b/v20.2/create-index.md @@ -12,7 +12,7 @@ The following types cannot be included in an index key, but can be stored (and u - [`ARRAY`](array.html) - The computed [`TUPLE`](scalar-expressions.html#tuple-constructor) type, even if it is constructed from indexed fields -To create an index on the schemaless data in a [`JSONB`](jsonb.html) column, use an [inverted index](inverted-indexes.html). +To create an index on the schemaless data in a [`JSONB`](jsonb.html) column, or on the data in an [`ARRAY`](array.html), use an [inverted index](inverted-indexes.html). {{site.data.alerts.callout_info}} Indexes are automatically created for a table's [`PRIMARY KEY`](primary-key.html) and [`UNIQUE`](unique.html) columns. When querying a table, CockroachDB uses the fastest index. For more information about that process, see [Index Selection in CockroachDB](https://www.cockroachlabs.com/blog/index-selection-cockroachdb-2/). @@ -49,6 +49,7 @@ Parameter | Description `STORING ...`| Store (but do not sort) each column whose name you include.

For information on when to use `STORING`, see [Store Columns](#store-columns). Note that columns that are part of a table's [`PRIMARY KEY`](primary-key.html) cannot be specified as `STORING` columns in secondary indexes on the table.

`COVERING` and `INCLUDE` are aliases for `STORING` and work identically. `opt_interleave` | You can potentially optimize query performance by [interleaving indexes](interleave-in-parent.html), which changes how CockroachDB stores your data.
{{site.data.alerts.callout_info}}[Hash-sharded indexes](indexes.html#hash-sharded-indexes) cannot be interleaved.{{site.data.alerts.end}} `opt_partition_by` | An [enterprise-only](enterprise-licensing.html) option that lets you [define index partitions at the row level](partitioning.html). +`opt_where_clause` | New in v20.2: An optional `WHERE` clause that defines the predicate boolean expression of a [partial index](partial-indexes.html). `USING HASH WITH BUCKET COUNT` | Creates a [hash-sharded index](indexes.html#hash-sharded-indexes) with `n_buckets` number of buckets.
{{site.data.alerts.callout_info}}To enable hash-sharded indexes, set the `experimental_enable_hash_sharded_indexes` [session variable](set-vars.html) to `on`.{{site.data.alerts.end}} `CONCURRENTLY` | Optional, no-op syntax for PostgreSQL compatibility. All indexes are created concurrently in CockroachDB. diff --git a/v20.2/create-table.md b/v20.2/create-table.md index d305856c885..9cfcef2ee66 100644 --- a/v20.2/create-table.md +++ b/v20.2/create-table.md @@ -96,6 +96,7 @@ Parameter | Description `LIKE table_name like_table_option_list` | New in v20.2: Create a new table based on the schema of an existing table, using supported specifiers. For details, see [Create a table like an existing table](#create-a-table-like-an-existing-table). For examples, see [Create a new table from an existing one](#create-a-new-table-from-an-existing-one). `opt_interleave` | You can potentially optimize query performance by [interleaving tables](interleave-in-parent.html), which changes how CockroachDB stores your data.
{{site.data.alerts.callout_info}}[Hash-sharded indexes](indexes.html#hash-sharded-indexes) cannot be interleaved.{{site.data.alerts.end}} `opt_partition_by` | An [enterprise-only](enterprise-licensing.html) option that lets you define table partitions at the row level. You can define table partitions by list or by range. See [Define Table Partitions](partitioning.html) for more information. +`opt_where_clause` | New in v20.2: An optional `WHERE` clause that defines the predicate boolean expression of a [partial index](partial-indexes.html). ## Table-level replication diff --git a/v20.2/indexes.md b/v20.2/indexes.md index 701ae9102c9..01bd72a047a 100644 --- a/v20.2/indexes.md +++ b/v20.2/indexes.md @@ -15,6 +15,8 @@ After a column is indexed, SQL can easily filter its values using the index inst For example, if you index an `INT` column and then filter it WHERE <indexed column> = 10, SQL can use the index to find values starting at 10 but less than 11. In contrast, without an index, SQL would have to evaluate _every_ row in the table for values equaling 10. This is also known as a "full table scan", and it can be very bad for query performance. +New in v20.2: You can also create an index on a subset of rows. This type of index is called a partial index. For more information, see [Partial indexes](partial-indexes.html). + ### Creation Each table automatically has an index created called `primary`, which indexes either its [primary key](primary-key.html) or—if there is no primary key—a unique value for each row known as `rowid`. We recommend always defining a primary key because the index it creates provides much better performance than letting CockroachDB use `rowid`. diff --git a/v20.2/partial-indexes.md b/v20.2/partial-indexes.md new file mode 100644 index 00000000000..cd89d7f79bb --- /dev/null +++ b/v20.2/partial-indexes.md @@ -0,0 +1,391 @@ +--- +title: Partial Indexes +summary: Partial indexes allow you to specify a subset of rows and columns to add to an index. +toc: true +--- + +New in v20.2: Partial indexes allow you to specify a subset of rows and columns to add to an [index](indexes.html). Partial indexes include the subset of rows in a table that evaluate to true on a boolean *predicate expression* (i.e., a `WHERE` filter) defined at [index creation](#creation). + +## How do partial indexes work? + +When you create a partial index, CockroachDB "indexes" the columns and rows that evaluate to true on the index's boolean predicate expression, creating a sorted copy of the subset of row values, without modifying the values in the table itself. + +CockroachDB can use a partial index to efficiently execute queries on any subset of rows implied by the partial index. When possible, the [cost-based optimizer](cost-based-optimizer.html) creates a plan that limits table scans on rows implied by the partial index to just the rows in the index. It also limits index rewrites to fewer rows. + +Partial indexes can improve cluster performance in a number of ways: + +- They contain fewer rows than full indexes, making them less expensive to create and store on a cluster. +- Read queries on rows included in a partial index only scan the rows in the partial index. This contrasts with queries on columns in full indexes, which must scan all rows in the indexed column. +- Write queries on tables with a partial index only perform an index write when the rows inserted satisfy the partial index predicate. This contrasts with write queries on tables with full indexes, which incur the overhead of a full index write when the rows inserted modify an indexed column. + +{{site.data.alerts.callout_info}} +When a query on a table with a partial index has a filter expression, the [cost-based optimizer](cost-based-optimizer.html) attempts to prove that the filter implies the partial index predicate. It is not guaranteed that the optimizer can prove the implication of arbitrarily complex expressions. Although unlikely, it is possible that a filter implies a predicate, but the optimizer cannot prove the implication. +{{site.data.alerts.end}} + +## Creation + +To create a partial index, use a [`CREATE INDEX`](create-index.html) statement, with a standard `WHERE` clause defining a predicate expression. + +For example, to define a partial index on columns `a` and `b` of table `t`, filtering on rows in column `c` greater than 5: + +~~~ sql +> CREATE INDEX ON t (a, b) WHERE c > 5; +~~~ + +The following queries use the partial index: + +~~~ sql +> SELECT a, b FROM t WHERE c > 5; +~~~ + +~~~ sql +> SELECT * FROM t WHERE c = 10; +~~~ + +The following queries do *not* use the partial index: + +~~~ sql +> SELECT a, b FROM t; +~~~ + +~~~ sql +> SELECT * FROM t WHERE c = 3; +~~~ + +When defining the predicate expression, note that: + +- The predicate expression must result in a [boolean](bool.html). +- The predicate expression can only refer to columns in the table being indexed. +- [Functions](functions-and-operators.html) used in predicates must be immutable. For example, the `now()` function is not allowed in predicates because its value depends on more than its arguments. + +## Unique partial indexes + +You can enforce [uniqueness](unique.html) on a subset of rows with `CREATE UNIQUE INDEX ... WHERE ...`. + +For example, to define a unique partial index on columns `a` and `b` for table `t`, filtering on rows in column `d` equal to `'x'`: + +~~~ sql +> CREATE UNIQUE INDEX ON t (a, b) WHERE d = 'x'; +~~~ + +This creates a partial index and a `UNIQUE` constraint on the subset of rows where `d` is equal to `'x'`. + +For another example, see [Create a partial index that enforces uniqueness on a subset of rows](#create-a-partial-index-that-enforces-uniqueness-on-a-subset-of-rows). + +{{site.data.alerts.callout_success}} +When [inserted values](insert.html) conflict with a `UNIQUE` constraint on one or more columns, CockroachDB normally returns an error. We recommend adding an [`ON CONFLICT`](insert.html#on-conflict-clause) clause to all `INSERT` statements that might conflict with rows in the unique index. +{{site.data.alerts.end}} + +## Index hints + +You can force queries [to use a specific partial index](table-expressions.html#force-index-selection) (also known as "index hinting"), like you can with full indexes. However, unlike full indexes, partial indexes cannot be used to satisfy all queries. If a query's filter implies the partial index predicate expression, the partial index will be used in the query plan. If not, an error will be returned. + +## Known limitations + +- CockroachDB does not currently support partial [inverted indexes](inverted-indexes.html). See [tracking issue](https://github.com/cockroachdb/cockroach/issues/50952). +- CockroachDB does not currently support [`IMPORT`](import.html) statements on tables with partial indexes. See [tracking issue](https://github.com/cockroachdb/cockroach/issues/50225). +- CockroachDB does not currently support multiple arbiter indexes for `INSERT ON CONFLICT DO UPDATE`, and will return an error if there are multiple unique or exclusion constraints matching the `ON CONFLICT DO UPDATE` specification. See [tracking issue](https://github.com/cockroachdb/cockroach/issues/53170). + +## Examples + +### Setup + +The following examples use MovR, a fictional vehicle-sharing application, to demonstrate CockroachDB SQL statements. For more information about the MovR example application and dataset, see [MovR: A Global Vehicle-sharing App](movr.html). + +To follow along, run [`cockroach demo`](cockroach-demo.html) to start a temporary, in-memory cluster with the `movr` workload: + +{% include copy-clipboard.html %} +~~~ shell +$ cockroach demo --with-load +~~~ + +### Create an index on a subset of rows + +Suppose that you want to query the subset of `rides` with a `revenue` greater than 80. + +{% include copy-clipboard.html %} +~~~ sql +> SELECT * FROM [SHOW TABLES] WHERE table_name='rides'; +~~~ + +~~~ + schema_name | table_name | type | estimated_row_count +--------------+------------+-------+---------------------- + public | rides | table | 29446 +(1 row) +~~~ + +Without a partial index, querying the `rides` table with a `WHERE revenue > 80` clause will scan the entire table. To see the plan for such a query, you can use an [`EXPLAIN` statement](explain.html): + +{% include copy-clipboard.html %} +~~~ sql +> EXPLAIN SELECT * FROM rides WHERE revenue > 80; +~~~ + +~~~ + tree | field | description +------------+---------------------+---------------- + | distribution | full + | vectorized | true + filter | | + │ | filter | revenue > 80 + └── scan | | + | estimated row count | 29446 + | table | rides@primary + | spans | FULL SCAN +(8 rows) +~~~ + +The `description` column for the `estimated row count` lists the number of rows that the query plan will scan (in this case, the entire table row count of 31337). The `description` for the `table` lists the index used in the scan (in this case, the [primary key index](primary-key.html)). + +To limit the number of rows scanned to just the rows that you are querying, you can create a partial index: + +{% include copy-clipboard.html %} +~~~ sql +> CREATE INDEX ON rides (city, revenue) WHERE revenue > 80; +~~~ + +{% include copy-clipboard.html %} +~~~ sql +> SHOW INDEXES FROM rides; +~~~ + +~~~ + table_name | index_name | non_unique | seq_in_index | column_name | direction | storing | implicit +-------------+-----------------------------------------------+------------+--------------+--------------+-----------+---------+----------- + rides | primary | false | 1 | city | ASC | false | false + rides | primary | false | 2 | id | ASC | false | false +... + rides | rides_city_revenue_idx | true | 1 | city | ASC | false | false + rides | rides_city_revenue_idx | true | 2 | revenue | ASC | false | false + rides | rides_city_revenue_idx | true | 3 | id | ASC | false | true +(12 rows) +~~~ + +Another `EXPLAIN` statement shows that the number of rows scanned by the original query decreases significantly with a partial index on the `rides` table: + +{% include copy-clipboard.html %} +~~~ sql +> EXPLAIN SELECT * FROM rides WHERE revenue > 80; +~~~ + +~~~ + tree | field | description +-------------+---------------------+----------------------------------------------- + | distribution | full + | vectorized | true + index join | | + │ | table | rides@primary + └── scan | | + | estimated row count | 5829 + | table | rides@rides_city_revenue_idx (partial index) + | spans | FULL SCAN +(8 rows) +~~~ + +Note that the query's `SELECT` statement queries all columns in the `rides` table, not just the indexed columns. As a result, an "index join" is required on both the primary index and the partial index. + +Querying only the columns in the index will make the query more efficient by removing the index join from the query plan: + +{% include copy-clipboard.html %} +~~~ sql +> EXPLAIN SELECT city, revenue FROM rides WHERE revenue > 80; +~~~ + +~~~ + tree | field | description +-------+---------------------+----------------------------------------------- + | distribution | full + | vectorized | true + scan | | + | estimated row count | 5829 + | table | rides@rides_city_revenue_idx (partial index) + | spans | FULL SCAN +(6 rows) +~~~ + +Querying a subset of the rows implied by the partial index predicate expression (in this case, `revenue > 80`) will also use the partial index: + +{% include copy-clipboard.html %} +~~~ sql +> EXPLAIN SELECT city, revenue FROM rides WHERE revenue > 95; +~~~ + +~~~ + tree | field | description +------------+---------------------+----------------------------------------------- + | distribution | full + | vectorized | true + filter | | + │ | filter | revenue > 95 + └── scan | | + | estimated row count | 5829 + | table | rides@rides_city_revenue_idx (partial index) + | spans | FULL SCAN +(8 rows) +~~~ + +The number of rows scanned is the same, and an additional filter is applied to the query plan so that only the subset specified by the filter is returned. + +So far, all the query scans in this example have spanned the entire partial index (i.e., performed a `FULL SCAN` of the index). This is because the `WHERE` clause does not filter on the first column in the index prefix (`city`). Filtering the query on both columns in the partial index will limit the scan to just the rows that match the filter: + +{% include copy-clipboard.html %} +~~~ sql +> EXPLAIN SELECT city, revenue FROM rides WHERE city = 'new york' AND revenue > 80; +~~~ + +~~~ + tree | field | description +-------+---------------------+----------------------------------------------- + | distribution | local + | vectorized | false + scan | | + | estimated row count | 660 + | table | rides@rides_city_revenue_idx (partial index) + | spans | [/'new york' - /'new york'] +(6 rows) +~~~ + +Refining the `revenue` filter expression to match just a subset of the partial index will lower the scanned row count even more: + +{% include copy-clipboard.html %} +~~~ sql +> EXPLAIN SELECT city, revenue FROM rides WHERE city = 'new york' AND revenue >= 90 AND revenue < 95; +~~~ + +~~~ + tree | field | description +-------+---------------------+----------------------------------------------- + | distribution | local + | vectorized | false + scan | | + | estimated row count | 174 + | table | rides@rides_city_revenue_idx (partial index) + | spans | [/'new york'/90 - /'new york'/95) +(6 rows) +~~~ + +### Create an index that excludes values + +Suppose that you have a number of rows in a table with values that you regularly filter out of selection queries (e.g., `NULL` values). + +A selection query on these values will require a full table scan, using the primary index, as shown by the [`EXPLAIN` statement](explain.html) below: + +{% include copy-clipboard.html %} +~~~ sql +> EXPLAIN SELECT * FROM rides WHERE end_time IS NOT NULL; +~~~ + +~~~ + tree | field | description +------------+---------------------+----------------------- + | distribution | full + | vectorized | true + filter | | + │ | filter | end_time IS NOT NULL + └── scan | | + | estimated row count | 31337 + | table | rides@primary + | spans | FULL SCAN +(8 rows) +~~~ + +You can create a partial index that excludes these rows, making queries that filter out the non-`NULL` values more efficient. + +{% include copy-clipboard.html %} +~~~ sql +> CREATE INDEX ON rides (city, revenue) WHERE end_time IS NOT NULL; +~~~ + +{% include copy-clipboard.html %} +~~~ sql +> SHOW INDEXES FROM rides; +~~~ + +~~~ + table_name | index_name | non_unique | seq_in_index | column_name | direction | storing | implicit +-------------+-----------------------------------------------+------------+--------------+--------------+-----------+---------+----------- + rides | primary | false | 1 | city | ASC | false | false + rides | primary | false | 2 | id | ASC | false | false +... + rides | rides_city_revenue_idx | true | 1 | city | ASC | false | false + rides | rides_city_revenue_idx | true | 2 | revenue | ASC | false | false + rides | rides_city_revenue_idx | true | 3 | id | ASC | false | true +(12 rows) +~~~ + +{% include copy-clipboard.html %} +~~~ sql +> EXPLAIN SELECT (city, revenue) FROM rides WHERE end_time IS NOT NULL; +~~~ + +~~~ + tree | field | description +------------+---------------------+----------------------------------------------- + | distribution | full + | vectorized | false + render | | + └── scan | | + | estimated row count | 5822 + | table | rides@rides_city_revenue_idx (partial index) + | spans | FULL SCAN +(7 rows) +~~~ + + +### Create a partial index that enforces uniqueness on a subset of rows + +Suppose that you want to constrain a subset of the rows in a table, such that all values for a particular column in the subset are unique. For example, let's say that every user in New York City must have a unique name. + +You can do this efficiently with a [unique partial index](#unique-partial-indexes): + +{% include copy-clipboard.html %} +~~~ sql +> CREATE UNIQUE INDEX ON users (name) WHERE city='new york'; +~~~ + +This creates a partial index and a [`UNIQUE` constraint](unique.html) on just the subset of rows where `city='new york'`. + +{% include copy-clipboard.html %} +~~~ sql +> SELECT id, name FROM users WHERE city='new york' LIMIT 3; +~~~ + +~~~ + id | name +---------------------------------------+------------------- + 8647a7cf-4af0-4c82-9344-224097f87b1a | Andre Sanchez + 598eaab2-5200-40cb-8e19-244d49f3f63a | Austin Meyer + 147ae147-ae14-4b00-8000-000000000004 | Catherine Nelson +(3 rows) +~~~ + +{% include copy-clipboard.html %} +~~~ sql +> INSERT INTO users(id, city, name) VALUES (gen_random_uuid(), 'new york', 'Andre Sanchez'); +~~~ + +~~~ +ERROR: duplicate key value (name)=('Andre Sanchez') violates unique constraint "users_name_key" +SQLSTATE: 23505 +~~~ + +Because the unique partial index predicate only implies the rows where `city='new york'`, the `UNIQUE` constraint does not apply to all rows in the table. + +{% include copy-clipboard.html %} +~~~ sql +> INSERT INTO users(id, city, name) VALUES (gen_random_uuid(), 'seattle', 'Andre Sanchez'); +~~~ + +~~~ +INSERT 1 +~~~ + +## See also + +- [Indexes](indexes.html) +- [`CREATE INDEX`](create-index.html) +- [`DROP INDEX`](drop-index.html) +- [`RENAME INDEX`](rename-index.html) +- [`SHOW INDEX`](show-index.html) +- [SQL Statements](sql-statements.html) diff --git a/v20.2/sql-feature-support.md b/v20.2/sql-feature-support.md index 4da9746eb20..fef70bdc20d 100644 --- a/v20.2/sql-feature-support.md +++ b/v20.2/sql-feature-support.md @@ -72,16 +72,16 @@ table tr td:nth-child(2) { Component | Supported | Type | Details -----------|-----------|------|--------- - Indexes | ✓ | Common Extension | [Indexes documentation](https://www.cockroachlabs.com/docs/indexes.html) + Indexes | ✓ | Common Extension | [Indexes documentation](indexes.html) Multi-column indexes | ✓ | Common Extension | We do not limit on the number of columns indexes can include Covering indexes | ✓ | Common Extension | [Storing Columns documentation](create-index.html#store-columns) Inverted indexes | ✓ | Common Extension | [Inverted Indexes documentation](inverted-indexes.html) - Multiple indexes per query | Planned | Common Extension | Use multiple indexes to filter the table's values for a single query + Partial indexes | ✓ | Common Extension | [Partial indexes documentation](partial-indexes.html) + Multiple indexes per query | Partial | Common Extension | [Use multiple indexes for a single query](https://github.com/cockroachdb/cockroach/issues/2142) Full-text indexes | Planned | Common Extension | [GitHub issue tracking full-text index support](https://github.com/cockroachdb/cockroach/issues/7821) Prefix/Expression Indexes | Potential | Common Extension | Apply expressions (such as `LOWER()`) to values before indexing them Geospatial indexes | Potential | Common Extension | Improves performance of queries calculating geospatial data Hash indexes | ✗ | Common Extension | Improves performance of queries looking for single, exact values - Partial indexes | ✗ | Common Extension | Only index specific rows from indexed columns ### Schema changes