From bc6379b9213e4f620cbe1f69c5d95a65946e2038 Mon Sep 17 00:00:00 2001 From: carlosdelest Date: Thu, 5 Oct 2023 17:27:20 +0200 Subject: [PATCH] Add sparse-vector field type to docs, changed references --- docs/reference/mapping/types.asciidoc | 1 + .../mapping/types/sparse-vector.asciidoc | 36 ++++ .../query-dsl/text-expansion-query.asciidoc | 54 +++--- .../semantic-search-elser.asciidoc | 170 +++++++++--------- 4 files changed, 149 insertions(+), 112 deletions(-) create mode 100644 docs/reference/mapping/types/sparse-vector.asciidoc diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc index 7108d536f8715..429e7012eff29 100644 --- a/docs/reference/mapping/types.asciidoc +++ b/docs/reference/mapping/types.asciidoc @@ -83,6 +83,7 @@ as-you-type completion. ==== Document ranking types <>:: Records dense vectors of float values. +<>:: Records sparse vectors of float values. <>:: Records a numeric feature to boost hits at query time. <>:: Records numeric features to boost hits at diff --git a/docs/reference/mapping/types/sparse-vector.asciidoc b/docs/reference/mapping/types/sparse-vector.asciidoc new file mode 100644 index 0000000000000..17a193eef1d4d --- /dev/null +++ b/docs/reference/mapping/types/sparse-vector.asciidoc @@ -0,0 +1,36 @@ +[[sparse-vector]] +=== Sparse vector field type +++++ +Sparse vector +++++ + +A `sparse_vector` field can index features and weights so that they can later be used to query +documents in queries with a <> query. + +`sparse_vector` is the field type that should be used with <>. + +[source,console] +-------------------------------------------------- +PUT my-index +{ + "mappings": { + "properties": { + "text.tokens": { + "type": "sparse_vector" + } + } + } +} +-------------------------------------------------- + +See <> for a complete example on adding documents + to a `sparse_vector` mapped field using ELSER. 
+ +NOTE: `sparse_vector` fields only support single-valued fields and strictly positive +values. Multi-valued fields and negative values will be rejected. + +NOTE: `sparse_vector` fields do not support querying, sorting or aggregating. They may +only be used within <> queries. + +NOTE: `sparse_vector` fields only preserve 9 significant bits for the precision, which +translates to a relative error of about 0.4%. diff --git a/docs/reference/query-dsl/text-expansion-query.asciidoc b/docs/reference/query-dsl/text-expansion-query.asciidoc index 74ee80ba821a1..d15fd40846529 100644 --- a/docs/reference/query-dsl/text-expansion-query.asciidoc +++ b/docs/reference/query-dsl/text-expansion-query.asciidoc @@ -4,9 +4,9 @@ Text expansion ++++ -The text expansion query uses a {nlp} model to convert the query text into a -list of token-weight pairs which are then used in a query against a -<>. +The text expansion query uses a {nlp} model to convert the query text into a +list of token-weight pairs which are then used in a query against a +<> or <> field. [discrete] [[text-expansion-query-ex-request]] @@ -19,7 +19,7 @@ GET _search { "query":{ "text_expansion":{ - "":{ + "":{ "model_id":"the model to produce the token weights", "model_text":"the query string" } @@ -33,33 +33,33 @@ GET _search [[text-expansion-query-params]] === Top level parameters for `text_expansion` -``::: +``::: (Required, object) -The name of the field that contains the token-weight pairs the NLP model created +The name of the field that contains the token-weight pairs the NLP model created based on the input text. [discrete] [[text-expansion-rank-feature-field-params]] -=== Top level parameters for `` +=== Top level parameters for `` `model_id`:::: (Required, string) -The ID of the model to use to convert the query text into token-weight pairs. It -must be the same model ID that was used to create the tokens from the input +The ID of the model to use to convert the query text into token-weight pairs. 
It +must be the same model ID that was used to create the tokens from the input text. `model_text`:::: (Required, string) -The query text you want to use for search. +The query text you want to use for search. [discrete] [[text-expansion-query-example]] === Example -The following is an example of the `text_expansion` query that references the -ELSER model to perform semantic search. For a more detailed description of how -to perform semantic search by using ELSER and the `text_expansion` query, refer +The following is an example of the `text_expansion` query that references the +ELSER model to perform semantic search. For a more detailed description of how +to perform semantic search by using ELSER and the `text_expansion` query, refer to <>. [source,console] @@ -82,25 +82,25 @@ GET my-index/_search [[optimizing-text-expansion]] === Optimizing the search performance of the text_expansion query -https://www.elastic.co/blog/faster-retrieval-of-top-hits-in-elasticsearch-with-block-max-wand[Max WAND] -is an optimization technique used by {es} to skip documents that cannot score -competitively against the current best matching documents. However, the tokens -generated by the ELSER model don't work well with the Max WAND optimization. -Consequently, enabling Max WAND can actually increase query latency for -`text_expansion`. For datasets of a significant size, disabling Max +https://www.elastic.co/blog/faster-retrieval-of-top-hits-in-elasticsearch-with-block-max-wand[Max WAND] +is an optimization technique used by {es} to skip documents that cannot score +competitively against the current best matching documents. However, the tokens +generated by the ELSER model don't work well with the Max WAND optimization. +Consequently, enabling Max WAND can actually increase query latency for +`text_expansion`. For datasets of a significant size, disabling Max WAND leads to lower query latencies. Max WAND is controlled by the -<> query parameter. 
Setting track_total_hits -to true forces {es} to consider all documents, resulting in lower query -latencies for the `text_expansion` query. However, other {es} queries run slower +<> query parameter. Setting track_total_hits +to true forces {es} to consider all documents, resulting in lower query +latencies for the `text_expansion` query. However, other {es} queries run slower when Max WAND is disabled. -If you are combining the `text_expansion` query with standard text queries in a -compound search, it is recommended to measure the query performance before +If you are combining the `text_expansion` query with standard text queries in a +compound search, it is recommended to measure the query performance before deciding which setting to use. -NOTE: The `track_total_hits` option applies to all queries in the search request -and may be optimal for some queries but not for others. Take into account the -characteristics of all your queries to determine the most suitable +NOTE: The `track_total_hits` option applies to all queries in the search request +and may be optimal for some queries but not for others. Take into account the +characteristics of all your queries to determine the most suitable configuration. diff --git a/docs/reference/search/search-your-data/semantic-search-elser.asciidoc b/docs/reference/search/search-your-data/semantic-search-elser.asciidoc index 082bb2ae2e020..0b4956dbe86ad 100644 --- a/docs/reference/search/search-your-data/semantic-search-elser.asciidoc +++ b/docs/reference/search/search-your-data/semantic-search-elser.asciidoc @@ -4,18 +4,18 @@ Semantic search with ELSER ++++ -Elastic Learned Sparse EncodeR - or ELSER - is an NLP model trained by Elastic -that enables you to perform semantic search by using sparse vector -representation. 
Instead of literal matching on search terms, semantic search -retrieves results based on the intent and the contextual meaning of a search +Elastic Learned Sparse EncodeR - or ELSER - is an NLP model trained by Elastic +that enables you to perform semantic search by using sparse vector +representation. Instead of literal matching on search terms, semantic search +retrieves results based on the intent and the contextual meaning of a search query. -The instructions in this tutorial shows you how to use ELSER to perform semantic +The instructions in this tutorial show you how to use ELSER to perform semantic search on your data. -NOTE: Only the first 512 extracted tokens per field are considered during -semantic search with ELSER. Refer to -{ml-docs}/ml-nlp-limitations.html#ml-nlp-elser-v1-limit-512[this page] for more +NOTE: Only the first 512 extracted tokens per field are considered during +semantic search with ELSER. Refer to +{ml-docs}/ml-nlp-limitations.html#ml-nlp-elser-v1-limit-512[this page] for more information. @@ -23,18 +23,18 @@ information. [[requirements]] ==== Requirements -To perform semantic search by using ELSER, you must have the NLP model deployed -in your cluster. Refer to the -{ml-docs}/ml-nlp-elser.html[ELSER documentation] to learn how to download and +To perform semantic search by using ELSER, you must have the NLP model deployed +in your cluster. Refer to the +{ml-docs}/ml-nlp-elser.html[ELSER documentation] to learn how to download and deploy the model. -NOTE: The minimum dedicated ML node size for deploying and using the ELSER model -is 4 GB in Elasticsearch Service if -{cloud}/ec-autoscaling.html[deployment autoscaling] is turned off. Turning on -autoscaling is recommended because it allows your deployment to dynamically -adjust resources based on demand. Better performance can be achieved by using -more allocations or more threads per allocation, which requires bigger ML nodes. -Autoscaling provides bigger nodes when required. 
If autoscaling is turned off, +NOTE: The minimum dedicated ML node size for deploying and using the ELSER model +is 4 GB in Elasticsearch Service if +{cloud}/ec-autoscaling.html[deployment autoscaling] is turned off. Turning on +autoscaling is recommended because it allows your deployment to dynamically +adjust resources based on demand. Better performance can be achieved by using +more allocations or more threads per allocation, which requires bigger ML nodes. +Autoscaling provides bigger nodes when required. If autoscaling is turned off, you must provide suitably sized nodes yourself. @@ -42,17 +42,17 @@ you must provide suitably sized nodes yourself. [[elser-mappings]] ==== Create the index mapping -First, the mapping of the destination index - the index that contains the tokens -that the model created based on your text - must be created. The destination -index must have a field with the -<> field type to index the +First, the mapping of the destination index - the index that contains the tokens +that the model created based on your text - must be created. The destination +index must have a field with the +<> or <> field type to index the ELSER output. -NOTE: ELSER output must be ingested into a field with the `sparse_vector` or -`rank_features` field type. Otherwise, {es} interprets the token-weight pairs as -a massive amount of fields in a document. If you get an error similar to this -`"Limit of total fields [1000] has been exceeded while adding new fields"` then -the ELSER output field is not mapped properly and it has a field type different +NOTE: ELSER output must be ingested into a field with the `sparse_vector` or +`rank_features` field type. Otherwise, {es} interprets the token-weight pairs as +a massive amount of fields in a document. 
If you get an error similar to this +`"Limit of total fields [1000] has been exceeded while adding new fields"` then +the ELSER output field is not mapped properly and it has a field type different than `sparse_vector` or `rank_features`. [source,console] @@ -74,19 +74,19 @@ PUT my-index // TEST[skip:TBD] <1> The name of the field to contain the generated tokens. <2> The field to contain the tokens is a `sparse_vector` field. -<3> The name of the field from which to create the sparse vector representation. +<3> The name of the field from which to create the sparse vector representation. In this example, the name of the field is `text`. <4> The field type which is text in this example. -To learn how to optimize space, refer to the <> section. +To learn how to optimize space, refer to the <> section. [discrete] [[inference-ingest-pipeline]] ==== Create an ingest pipeline with an inference processor -Create an <> with an -<> to use ELSER to infer against the data +Create an <> with an +<> to use ELSER to infer against the data that is being ingested in the pipeline. [source,console] @@ -112,10 +112,10 @@ PUT _ingest/pipeline/elser-v2-test } ---- // TEST[skip:TBD] -<1> The `field_map` object maps the input document field name (which is `text` -in this example) to the name of the field that the model expects (which is +<1> The `field_map` object maps the input document field name (which is `text` +in this example) to the name of the field that the model expects (which is always `text_field`). -<2> The `text_expansion` inference type needs to be used in the {infer} ingest +<2> The `text_expansion` inference type needs to be used in the {infer} ingest processor. @@ -123,19 +123,19 @@ processor. [[load-data]] ==== Load data -In this step, you load the data that you later use in the {infer} ingest +In this step, you load the data that you later use in the {infer} ingest pipeline to extract tokens from it. 
-Use the `msmarco-passagetest2019-top1000` data set, which is a subset of the MS -MARCO Passage Ranking data set. It consists of 200 queries, each accompanied by -a list of relevant text passages. All unique passages, along with their IDs, -have been extracted from that data set and compiled into a +Use the `msmarco-passagetest2019-top1000` data set, which is a subset of the MS +MARCO Passage Ranking data set. It consists of 200 queries, each accompanied by +a list of relevant text passages. All unique passages, along with their IDs, +have been extracted from that data set and compiled into a https://github.com/elastic/stack-docs/blob/main/docs/en/stack/ml/nlp/data/msmarco-passagetest2019-unique.tsv[tsv file]. -Download the file and upload it to your cluster using the -{kibana-ref}/connect-to-elasticsearch.html#upload-data-kibana[Data Visualizer] -in the {ml-app} UI. Assign the name `id` to the first column and `text` to the -second column. The index name is `test-data`. Once the upload is complete, you +Download the file and upload it to your cluster using the +{kibana-ref}/connect-to-elasticsearch.html#upload-data-kibana[Data Visualizer] +in the {ml-app} UI. Assign the name `id` to the first column and `text` to the +second column. The index name is `test-data`. Once the upload is complete, you can see an index named `test-data` with 182469 documents. @@ -143,7 +143,7 @@ can see an index named `test-data` with 182469 documents. [[reindexing-data-elser]] ==== Ingest the data through the {infer} ingest pipeline -Create the tokens from the text by reindexing the data throught the {infer} +Create the tokens from the text by reindexing the data through the {infer} pipeline that uses ELSER as the inference model. [source,console] @@ -161,8 +161,8 @@ POST _reindex?wait_for_completion=false } ---- // TEST[skip:TBD] -<1> The default batch size for reindexing is 1000. 
Reducing `size` to a smaller -number makes the update of the reindexing process quicker which enables you to +<1> The default batch size for reindexing is 1000. Reducing `size` to a smaller +number makes the update of the reindexing process quicker which enables you to follow the progress closely and detect errors early. The call returns a task ID to monitor the progress: @@ -173,7 +173,7 @@ GET _tasks/ ---- // TEST[skip:TBD] -You can also open the Trained Models UI, select the Pipelines tab under ELSER to +You can also open the Trained Models UI, select the Pipelines tab under ELSER to follow the progress. @@ -181,9 +181,9 @@ follow the progress. [[text-expansion-query]] ==== Semantic search by using the `text_expansion` query -To perform semantic search, use the `text_expansion` query, and provide the -query text and the ELSER model ID. The example below uses the query text "How to -avoid muscle soreness after running?", the `ml.tokens` field contains the +To perform semantic search, use the `text_expansion` query, and provide the +query text and the ELSER model ID. The example below uses the query text "How to +avoid muscle soreness after running?", the `ml.tokens` field contains the generated ELSER output: [source,console] @@ -202,9 +202,9 @@ GET my-index/_search ---- // TEST[skip:TBD] -The result is the top 10 documents that are closest in meaning to your query -text from the `my-index` index sorted by their relevancy. The result also -contains the extracted tokens for each of the relevant search results with their +The result is the top 10 documents that are closest in meaning to your query +text from the `my-index` index sorted by their relevancy. The result also +contains the extracted tokens for each of the relevant search results with their weights. [source,consol-result] @@ -246,7 +246,7 @@ weights. ---- // NOTCONSOLE -To learn about optimizing your `text_expansion` query, refer to +To learn about optimizing your `text_expansion` query, refer to <>. 
@@ -254,16 +254,16 @@ To learn about optimizing your `text_expansion` query, refer to [[text-expansion-compound-query]] ==== Combining semantic search with other queries -You can combine `text_expansion` with other queries in a -<>. For example using a filter clause in a -<> or a full text query which may or may not use the same -query text as the `text_expansion` query. This enables you to combine the search +You can combine `text_expansion` with other queries in a +<>. For example using a filter clause in a +<> or a full text query which may or may not use the same +query text as the `text_expansion` query. This enables you to combine the search results from both queries. -The search hits from the `text_expansion` query tend to score higher than other -{es} queries. Those scores can be regularized by increasing or decreasing the -relevance scores of each query by using the `boost` parameter. Recall on the -`text_expansion` query can be high where there is a long tail of less relevant +The search hits from the `text_expansion` query tend to score higher than other +{es} queries. Those scores can be regularized by increasing or decreasing the +relevance scores of each query by using the `boost` parameter. Recall on the +`text_expansion` query can be high where there is a long tail of less relevant results. Use the `min_score` parameter to prune those less relevant documents. [source,console] @@ -274,7 +274,7 @@ GET my-index/_search "bool": { <1> "should": [ { - "text_expansion": { + "text_expansion": { "ml.tokens": { "model_text": "How to avoid muscle soreness after running?", "model_id": ".elser_model_2", @@ -295,13 +295,13 @@ GET my-index/_search } ---- // TEST[skip:TBD] -<1> Both the `text_expansion` and the `query_string` queries are in a `should` +<1> Both the `text_expansion` and the `query_string` queries are in a `should` clause of a `bool` query. -<2> The `boost` value is `1` for the `text_expansion` query which is the default -value. 
This means that the relevance score of the results of this query are not +<2> The `boost` value is `1` for the `text_expansion` query which is the default +value. This means that the relevance score of the results of this query is not boosted. -<3> The `boost` value is `4` for the `query_string` query. The relevance score -of the results of this query is increased causing them to rank higher in the +<3> The `boost` value is `4` for the `query_string` query. The relevance score +of the results of this query is increased causing them to rank higher in the search results. <4> Only the results with a score equal to or higher than `10` are displayed. @@ -314,22 +314,22 @@ search results. [[save-space]] ==== Saving disk space by excluding the ELSER tokens from document source -The tokens generated by ELSER must be indexed for use in the -<>. However, it is not -necessary to retain those terms in the document source. You can save disk space -by using the <> mapping to remove the ELSER -terms from the document source. - -WARNING: Reindex uses the document source to populate the destination index. -Once the ELSER terms have been excluded from the source, they cannot be -recovered through reindexing. Excluding the tokens from the source is a -space-saving optimsation that should only be applied if you are certain that -reindexing will not be required in the future! It's important to carefully -consider this trade-off and make sure that excluding the ELSER terms from the +The tokens generated by ELSER must be indexed for use in the +<>. However, it is not +necessary to retain those terms in the document source. You can save disk space +by using the <> mapping to remove the ELSER +terms from the document source. + +WARNING: Reindex uses the document source to populate the destination index. +Once the ELSER terms have been excluded from the source, they cannot be +recovered through reindexing. 
Excluding the tokens from the source is a +space-saving optimization that should only be applied if you are certain that +reindexing will not be required in the future! It's important to carefully +consider this trade-off and make sure that excluding the ELSER terms from the source aligns with your specific requirements and use case. -The mapping that excludes `ml.tokens` from the `_source` field can be created -by the following API call: +The mapping that excludes `ml.tokens` from the `_source` field can be created +by the following API call: [source,console] ---- @@ -343,10 +343,10 @@ PUT my-index }, "properties": { "ml.tokens": { - "type": "sparse_vector" + "type": "sparse_vector" }, - "text": { - "type": "text" + "text": { + "type": "text" } } }