From 8c939b0f1e982d6cf6c72787711a75265c7e8dff Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Tue, 15 Nov 2022 10:59:16 -0500 Subject: [PATCH] Adds search documentation (#1752) * Adds search documentation Signed-off-by: Fanit Kolchina * Incorporated review comments Signed-off-by: Fanit Kolchina * Incorporated doc review feedback Signed-off-by: Fanit Kolchina * Apply suggestions from code review Co-authored-by: Nate Bower * Apply suggestions from code review Co-authored-by: Nate Bower * Implemented editorial comments Signed-off-by: Fanit Kolchina * Minor style edits Signed-off-by: Fanit Kolchina * Grammar edits Signed-off-by: Fanit Kolchina * Minor edit Signed-off-by: Fanit Kolchina Signed-off-by: Fanit Kolchina Co-authored-by: Nate Bower --- _opensearch/search/autocomplete.md | 1031 ++++++++++++++++ _opensearch/search/did-you-mean.md | 568 +++++++++ _opensearch/search/highlight.md | 964 +++++++++++++++ _opensearch/search/index.md | 20 + _opensearch/search/paginate.md | 274 +++++ _opensearch/search/sort.md | 880 ++++++++++++++ .../supported-field-types/completion.md | 2 +- _opensearch/supported-field-types/nested.md | 14 +- _opensearch/ux.md | 1069 ----------------- 9 files changed, 3745 insertions(+), 1077 deletions(-) create mode 100644 _opensearch/search/autocomplete.md create mode 100644 _opensearch/search/did-you-mean.md create mode 100644 _opensearch/search/highlight.md create mode 100644 _opensearch/search/index.md create mode 100644 _opensearch/search/paginate.md create mode 100644 _opensearch/search/sort.md delete mode 100644 _opensearch/ux.md diff --git a/_opensearch/search/autocomplete.md b/_opensearch/search/autocomplete.md new file mode 100644 index 0000000000..eac088fb05 --- /dev/null +++ b/_opensearch/search/autocomplete.md @@ -0,0 +1,1031 @@ +--- +layout: default +title: Autocomplete +parent: Searching data +nav_order: 24 +--- + +# Autocomplete functionality + +Autocomplete shows suggestions to users 
while they type. + +For example, if a user types "pop," OpenSearch provides suggestions like "popcorn" or "popsicles." These suggestions preempt your user's intention and lead them to a possible search term more quickly. + +OpenSearch lets you design autocomplete that updates with each keystroke, provides a few relevant suggestions, and tolerates typos. + +Implement autocomplete using one of the following methods: + +- [Prefix matching](#prefix-matching) +- [Edge n-gram matching](#edge-n-gram-matching) +- [Search as you type](#search-as-you-type) +- [Completion suggesters](#completion-suggester) + +While prefix matching happens at query time, the other three methods happen at index time. All methods are described in the following sections. + +## Prefix matching + +Prefix matching finds documents that match the last term in a query string. + +For example, assume that the user types “qui” into a search UI. To autocomplete this phrase, use the `match_phrase_prefix` query to search for all `text_entry` field values that begin with the prefix "qui": + +```json +GET shakespeare/_search +{ + "query": { + "match_phrase_prefix": { + "text_entry": { + "query": "qui", + "slop": 3 + } + } + } +} +``` + +To make the word order and relative positions flexible, specify a `slop` value. To learn about the `slop` option, see [Other optional query fields]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text#other-optional-query-fields). + +Prefix matching doesn’t require any special mappings. It works with your data as is. +However, it’s a fairly resource-intensive operation. A prefix of `a` could match hundreds of thousands of terms and not be useful to your user. 
+To limit the impact of prefix expansion, set `max_expansions` to a reasonable number: + +```json +GET shakespeare/_search +{ + "query": { + "match_phrase_prefix": { + "text_entry": { + "query": "qui", + "slop": 3, + "max_expansions": 10 + } + } + } +} +``` + +To learn about the `max_expansions` option, see [Other optional query fields]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text#other-optional-query-fields). + +The ease of implementing query-time autocomplete comes at the cost of performance. +When implementing this feature on a large scale, we recommend an index-time solution. With an index-time solution, you might experience slower indexing, but it’s a price you pay only once and not for every query. The edge n-gram, search-as-you-type, and completion suggester methods are index-time solutions. + +## Edge n-gram matching + +During indexing, edge n-grams split a word into a sequence of n characters to support a faster lookup of partial search terms. + +If you n-gram the word "quick," the results depend on the value of n. + +n | Type | n-gram +:--- | :--- | :--- +1 | Unigram | [ `q`, `u`, `i`, `c`, `k` ] +2 | Bigram | [ `qu`, `ui`, `ic`, `ck` ] +3 | Trigram | [ `qui`, `uic`, `ick` ] +4 | Four-gram | [ `quic`, `uick` ] +5 | Five-gram | [ `quick` ] + +Autocomplete needs only the beginning n-grams of a search phrase, so OpenSearch uses a special type of n-gram called *edge n-gram*. + +Edge n-gramming the word "quick" results in the following: + +- `q` +- `qu` +- `qui` +- `quic` +- `quick` + +This follows the same sequence the user types. 
+ +To configure a field to use edge n-grams, create an autocomplete analyzer with an `edge_ngram` filter: + + +```json +PUT shakespeare +{ + "mappings": { + "properties": { + "text_entry": { + "type": "text", + "analyzer": "autocomplete" + } + } + }, + "settings": { + "analysis": { + "filter": { + "edge_ngram_filter": { + "type": "edge_ngram", + "min_gram": 1, + "max_gram": 20 + } + }, + "analyzer": { + "autocomplete": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "edge_ngram_filter" + ] + } + } + } + } +} +``` + +This example creates the index and instantiates the edge n-gram filter and analyzer. + +The `edge_ngram_filter` produces edge n-grams with a minimum n-gram length of 1 (a single letter) and a maximum length of 20. So it offers suggestions for words of up to 20 letters. + +The `autocomplete` analyzer tokenizes a string into individual terms, lowercases the terms, and then produces edge n-grams for each term using the `edge_ngram_filter`. + +Use the `analyze` operation to test this analyzer: + +```json +POST shakespeare/_analyze +{ + "analyzer": "autocomplete", + "text": "quick" +} +``` + +It returns edge n-grams as tokens: + +* `q` +* `qu` +* `qui` +* `quic` +* `quick` + +Use the `standard` analyzer at search time. Otherwise, the search query splits into edge n-grams and you get results for everything that matches `q`, `u`, and `i`. 
+This is one of the few occasions when you use different analyzers at index time and at query time: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "text_entry": { + "query": "qui", + "analyzer": "standard" + } + } + } +} +``` + +The response contains the matching documents: + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 533, + "relation": "eq" + }, + "max_score": 9.712725, + "hits": [ + { + "_index": "shakespeare", + "_id": "22006", + "_score": 9.712725, + "_source": { + "type": "line", + "line_id": 22007, + "play_name": "Antony and Cleopatra", + "speech_number": 12, + "line_number": "5.2.44", + "speaker": "CLEOPATRA", + "text_entry": "Quick, quick, good hands." + } + }, + { + "_index": "shakespeare", + "_id": "54665", + "_score": 9.712725, + "_source": { + "type": "line", + "line_id": 54666, + "play_name": "Loves Labours Lost", + "speech_number": 21, + "line_number": "5.1.52", + "speaker": "HOLOFERNES", + "text_entry": "Quis, quis, thou consonant?" + } + } + ... + ] + } +} +``` + +Alternatively, specify the `search_analyzer` in the mapping itself: + +```json +"mappings": { + "properties": { + "text_entry": { + "type": "text", + "analyzer": "autocomplete", + "search_analyzer": "standard" + } + } +} +``` + +## Completion suggester + +The completion suggester accepts a list of suggestions and builds them into a finite-state transducer (FST), an optimized data structure that is essentially a graph. This data structure lives in memory and is optimized for fast prefix lookups. To learn more about FSTs, see [Wikipedia](https://en.wikipedia.org/wiki/Finite-state_transducer). + +As the user types, the completion suggester moves through the FST graph one character at a time along a matching path. After it runs out of user input, it examines the remaining endings to produce a list of suggestions. 
+ +The completion suggester makes your autocomplete solution as efficient as possible and lets you have explicit control over its suggestions. + +Use a dedicated field type called [`completion`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/completion), which stores the FST-like data structures in the index: + +```json +PUT shakespeare +{ + "mappings": { + "properties": { + "text_entry": { + "type": "completion" + } + } + } +} +``` + +To get suggestions, use the `search` endpoint with the `suggest` parameter: + +```json +GET shakespeare/_search +{ + "suggest": { + "autocomplete": { + "prefix": "To be", + "completion": { + "field": "text_entry" + } + } + } +} +``` + +The phrase "to be" is prefix matched with the FST of the `text_entry` field: + +```json +{ + "took" : 29, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "suggest" : { + "autocomplete" : [ + { + "text" : "To be", + "offset" : 0, + "length" : 5, + "options" : [ + { + "text" : "To be a comrade with the wolf and owl,--", + "_index" : "shakespeare", + "_id" : "50652", + "_score" : 1.0, + "_source" : { + "type" : "line", + "line_id" : 50653, + "play_name" : "King Lear", + "speech_number" : 68, + "line_number" : "2.4.230", + "speaker" : "KING LEAR", + "text_entry" : "To be a comrade with the wolf and owl,--" + } + }, + { + "text" : "To be a make-peace shall become my age:", + "_index" : "shakespeare", + "_id" : "78566", + "_score" : 1.0, + "_source" : { + "type" : "line", + "line_id" : 78567, + "play_name" : "Richard II", + "speech_number" : 20, + "line_number" : "1.1.160", + "speaker" : "JOHN OF GAUNT", + "text_entry" : "To be a make-peace shall become my age:" + } + }, + { + "text" : "To be a party in this injury.", + "_index" : "shakespeare", + "_id" : "75259", + "_score" : 1.0, + "_source" : { + "type" : "line", + 
"line_id" : 75260, + "play_name" : "Othello", + "speech_number" : 57, + "line_number" : "5.1.93", + "speaker" : "IAGO", + "text_entry" : "To be a party in this injury." + } + }, + { + "text" : "To be a preparation gainst the Polack;", + "_index" : "shakespeare", + "_id" : "33591", + "_score" : 1.0, + "_source" : { + "type" : "line", + "line_id" : 33592, + "play_name" : "Hamlet", + "speech_number" : 17, + "line_number" : "2.2.67", + "speaker" : "VOLTIMAND", + "text_entry" : "To be a preparation gainst the Polack;" + } + }, + { + "text" : "To be a public spectacle to all:", + "_index" : "shakespeare", + "_id" : "3709", + "_score" : 1.0, + "_source" : { + "type" : "line", + "line_id" : 3710, + "play_name" : "Henry VI Part 1", + "speech_number" : 6, + "line_number" : "1.4.41", + "speaker" : "TALBOT", + "text_entry" : "To be a public spectacle to all:" + } + } + ] + } + ] + } +} +``` + +To specify the number of suggestions that you want to return, use the `size` parameter: + +```json +GET shakespeare/_search +{ + "suggest": { + "autocomplete": { + "prefix": "To n", + "completion": { + "field": "text_entry", + "size": 3 + } + } + } +} +``` + +The maximum of three documents is returned: + +```json +{ + "took" : 4109, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "suggest" : { + "autocomplete" : [ + { + "text" : "To n", + "offset" : 0, + "length" : 4, + "options" : [ + { + "text" : "To NESTOR", + "_index" : "shakespeare", + "_id" : "99707", + "_score" : 1.0, + "_source" : { + "type" : "line", + "line_id" : 99708, + "play_name" : "Troilus and Cressida", + "speech_number" : 3, + "line_number" : "", + "speaker" : "ULYSSES", + "text_entry" : "To NESTOR" + } + }, + { + "text" : "To name the bigger light, and how the less,", + "_index" : "shakespeare", + "_id" : "91884", + "_score" : 1.0, + "_source" : { + 
"type" : "line", + "line_id" : 91885, + "play_name" : "The Tempest", + "speech_number" : 91, + "line_number" : "1.2.394", + "speaker" : "CALIBAN", + "text_entry" : "To name the bigger light, and how the less," + } + }, + { + "text" : "To nature none more bound; his training such,", + "_index" : "shakespeare", + "_id" : "40510", + "_score" : 1.0, + "_source" : { + "type" : "line", + "line_id" : 40511, + "play_name" : "Henry VIII", + "speech_number" : 18, + "line_number" : "1.2.126", + "speaker" : "KING HENRY VIII", + "text_entry" : "To nature none more bound; his training such," + } + } + ] + } + ] + } +} +``` + +The `suggest` parameter finds suggestions using only prefix matching. +For example, the document "To be, or not to be" is not part of the results. If you want specific documents returned as suggestions, you can manually add curated suggestions and add weights to prioritize your suggestions. + +Index a document with input suggestions and assign a weight: + +```json +PUT shakespeare/_doc/1?refresh=true +{ + "text_entry": { + "input": [ + "To n", "To be, or not to be: that is the question:" + ], + "weight": 10 + } +} +``` + +Perform the same search: + +```json +GET shakespeare/_search +{ + "suggest": { + "autocomplete": { + "prefix": "To n", + "completion": { + "field": "text_entry", + "size": 3 + } + } + } +} +``` + +You see the indexed document as the first result: + +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "suggest" : { + "autocomplete" : [ + { + "text" : "To n", + "offset" : 0, + "length" : 4, + "options" : [ + { + "text" : "To n", + "_index" : "shakespeare", + "_id" : "1", + "_score" : 10.0, + "_source" : { + "text_entry" : { + "input" : [ + "To n", + "To be, or not to be: that is the question:" + ], + "weight" : 10 + } + } + }, + { + "text" : "To 
NESTOR", + "_index" : "shakespeare", + "_id" : "99707", + "_score" : 1.0, + "_source" : { + "type" : "line", + "line_id" : 99708, + "play_name" : "Troilus and Cressida", + "speech_number" : 3, + "line_number" : "", + "speaker" : "ULYSSES", + "text_entry" : "To NESTOR" + } + }, + { + "text" : "To name the bigger light, and how the less,", + "_index" : "shakespeare", + "_id" : "91884", + "_score" : 1.0, + "_source" : { + "type" : "line", + "line_id" : 91885, + "play_name" : "The Tempest", + "speech_number" : 91, + "line_number" : "1.2.394", + "speaker" : "CALIBAN", + "text_entry" : "To name the bigger light, and how the less," + } + } + ] + } + ] + } +} +``` + +You can also allow for misspellings in queries by specifying the `fuzzy` parameter: + +```json +GET shakespeare/_search +{ + "suggest": { + "autocomplete": { + "prefix": "rosenkrantz", + "completion": { + "field": "text_entry", + "size": 3, + "fuzzy" : { + "fuzziness" : "AUTO" + } + } + } + } +} +``` + +The result matches the correct spelling: + +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "suggest" : { + "autocomplete" : [ + { + "text" : "rosenkrantz", + "offset" : 0, + "length" : 11, + "options" : [ + { + "text" : "ROSENCRANTZ:", + "_index" : "shakespeare", + "_id" : "35196", + "_score" : 5.0, + "_source" : { + "type" : "line", + "line_id" : 35197, + "play_name" : "Hamlet", + "speech_number" : 2, + "line_number" : "4.2.1", + "speaker" : "HAMLET", + "text_entry" : "ROSENCRANTZ:" + } + } + ] + } + ] + } +} +``` + +You can use a regular expression to define the prefix for the completion suggester query: + +```json +GET shakespeare/_search +{ + "suggest": { + "autocomplete": { + "prefix": "rosen*", + "completion": { + "field": "text_entry", + "size": 3 + } + } + } +} +``` + +For more information, see the 
[`completion` field type documentation]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/completion). + +## Search as you type + +OpenSearch has a dedicated [`search_as_you_type`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/search-as-you-type) field type that is optimized for search-as-you-type functionality and can match terms using both prefix and infix completion. The `search_as_you_type` field does not require you to set up a custom analyzer or index suggestions beforehand. + +First, map the field as `search_as_you_type`: + +```json +PUT shakespeare +{ + "mappings": { + "properties": { + "text_entry": { + "type": "search_as_you_type" + } + } + } +} +``` + +After you index a document, OpenSearch automatically creates and stores its n-grams and edge n-grams. For example, consider the string `that is the question`. First, it is split into terms using the standard analyzer, and the terms are stored in the `text_entry` field: + +```json +[ + "that", + "is", + "the", + "question" +] +``` + +In addition to storing these terms, the following 2-grams for this field are stored in the field `text_entry._2gram`: + +```json +[ + "that is", + "is the", + "the question" +] +``` + +The following 3-grams for this field are stored in the field `text_entry._3gram`: + +```json +[ + "that is the", + "is the question" +] +``` + +Finally, after an edge n-gram token filter is applied, the resulting terms are stored in the `text_entry._index_prefix` field: + +```json +[ + "t", + "th", + "tha", + "that", + ... 
+] +``` + +You can then match terms in any order using the `bool_prefix` type of a `multi_match` query: + +```json +GET shakespeare/_search +{ + "query": { + "multi_match": { + "query": "uncle what", + "type": "bool_prefix", + "fields": [ + "text_entry", + "text_entry._2gram", + "text_entry._3gram" + ] + } + }, + "size": 3 +} +``` + +The documents in which the words appear in the same order as in the query are ranked higher in the results: + +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 4759, + "relation" : "eq" + }, + "max_score" : 10.437667, + "hits" : [ + { + "_index" : "shakespeare", + "_id" : "2817", + "_score" : 10.437667, + "_source" : { + "type" : "line", + "line_id" : 2818, + "play_name" : "Henry IV", + "speech_number" : 5, + "line_number" : "5.2.31", + "speaker" : "HOTSPUR", + "text_entry" : "Uncle, what news?" + } + }, + { + "_index" : "shakespeare", + "_id" : "37085", + "_score" : 9.437667, + "_source" : { + "type" : "line", + "line_id" : 37086, + "play_name" : "Henry V", + "speech_number" : 26, + "line_number" : "1.2.262", + "speaker" : "KING HENRY V", + "text_entry" : "What treasure, uncle?" + } + }, + { + "_index" : "shakespeare", + "_id" : "79274", + "_score" : 9.358302, + "_source" : { + "type" : "line", + "line_id" : 79275, + "play_name" : "Richard II", + "speech_number" : 29, + "line_number" : "2.1.187", + "speaker" : "KING RICHARD II", + "text_entry" : "Why, uncle, whats the matter?" 
+ } + } + ] + } +} +``` + +To match terms in order, you can use a `match_phrase_prefix` query: + +```json +GET shakespeare/_search +{ + "query": { + "match_phrase_prefix": { + "text_entry": "uncle wha" + } + }, + "size": 3 +} +``` + +The response contains documents that match the prefix: + +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 6, + "relation" : "eq" + }, + "max_score" : 16.37664, + "hits" : [ + { + "_index" : "shakespeare", + "_id" : "2817", + "_score" : 16.37664, + "_source" : { + "type" : "line", + "line_id" : 2818, + "play_name" : "Henry IV", + "speech_number" : 5, + "line_number" : "5.2.31", + "speaker" : "HOTSPUR", + "text_entry" : "Uncle, what news?" + } + }, + { + "_index" : "shakespeare", + "_id" : "6789", + "_score" : 16.37664, + "_source" : { + "type" : "line", + "line_id" : 6790, + "play_name" : "Henry VI Part 2", + "speech_number" : 60, + "line_number" : "1.3.202", + "speaker" : "KING HENRY VI", + "text_entry" : "Uncle, what shall we say to this in law?" + } + }, + { + "_index" : "shakespeare", + "_id" : "7877", + "_score" : 16.37664, + "_source" : { + "type" : "line", + "line_id" : 7878, + "play_name" : "Henry VI Part 2", + "speech_number" : 13, + "line_number" : "3.2.28", + "speaker" : "KING HENRY VI", + "text_entry" : "Where is our uncle? whats the matter, Suffolk?" 
+ } + } + ] + } +} +``` + +Finally, to match the last term exactly and not as a prefix, you can use a `match_phrase` query: + +```json +GET shakespeare/_search +{ + "query": { + "match_phrase": { + "text_entry": "uncle what" + } + }, + "size": 5 +} +``` + +The response contains exact matches: + +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 14.437452, + "hits" : [ + { + "_index" : "shakespeare", + "_id" : "2817", + "_score" : 14.437452, + "_source" : { + "type" : "line", + "line_id" : 2818, + "play_name" : "Henry IV", + "speech_number" : 5, + "line_number" : "5.2.31", + "speaker" : "HOTSPUR", + "text_entry" : "Uncle, what news?" + } + }, + { + "_index" : "shakespeare", + "_id" : "6789", + "_score" : 9.461917, + "_source" : { + "type" : "line", + "line_id" : 6790, + "play_name" : "Henry VI Part 2", + "speech_number" : 60, + "line_number" : "1.3.202", + "speaker" : "KING HENRY VI", + "text_entry" : "Uncle, what shall we say to this in law?" + } + }, + { + "_index" : "shakespeare", + "_id" : "100955", + "_score" : 8.947967, + "_source" : { + "type" : "line", + "line_id" : 100956, + "play_name" : "Troilus and Cressida", + "speech_number" : 28, + "line_number" : "3.2.98", + "speaker" : "CRESSIDA", + "text_entry" : "Well, uncle, what folly I commit, I dedicate to you." 
+ } + } + ] + } +} +``` + +If you modify the text in the previous `match_phrase` query and omit the last letter, none of the documents in the previous response are returned: + +```json +GET shakespeare/_search +{ + "query": { + "match_phrase": { + "text_entry": "uncle wha" + } + } +} +``` + +The result is empty: + +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + } +} +``` + +For more information, see the [`search_as_you_type` field type documentation]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/search-as-you-type). \ No newline at end of file diff --git a/_opensearch/search/did-you-mean.md b/_opensearch/search/did-you-mean.md new file mode 100644 index 0000000000..2ceedd1503 --- /dev/null +++ b/_opensearch/search/did-you-mean.md @@ -0,0 +1,568 @@ +--- +layout: default +title: Did-you-mean +parent: Searching data +nav_order: 25 +--- + +# Did-you-mean + +The `Did-you-mean` suggester shows suggested corrections for misspelled search terms. + +For example, if a user types "fliud," OpenSearch suggests a corrected search term like "fluid." You can then suggest the corrected term to the user or even automatically correct the search term. + +You can implement the `did-you-mean` suggester using one of the following methods: + +- Use a [term suggester](#term-suggester) to suggest corrections for individual words. +- Use a [phrase suggester](#phrase-suggester) to suggest corrections for phrases. + +## Term suggester + +Use the term suggester to suggest corrected spellings for individual words. +The term suggester uses an [edit distance](https://en.wikipedia.org/wiki/Edit_distance) to compute suggestions. + +The edit distance is the number of single-character insertions, deletions, or substitutions that need to be performed for a term to match another term. 
For example, to change the word "cat" to "hats", you need to substitute "h" for "c" and insert an "s", so the edit distance in this case is 2. + +To use the term suggester, you don't need any special field mappings for your index. By default, string field types are mapped as `text`. A `text` field is analyzed, so the `title` in the following example is tokenized into individual words. Indexing the following documents creates a `books` index where `title` is a `text` field: + +```json +PUT books/_doc/1 +{ + "title": "Design Patterns (Object-Oriented Software)" +} + +PUT books/_doc/2 +{ + "title": "Software Architecture Patterns Explained" +} +``` + +To check how a string is split into tokens, you can use the `_analyze` endpoint. To apply the same analyzer that the field uses, you can specify the field's name in the `field` parameter: + +```json +GET books/_analyze +{ + "text": "Design Patterns (Object-Oriented Software)", + "field": "title" +} +``` + +The default analyzer (`standard`) splits a string at word boundaries, removes punctuation, and lowercases the tokens: + +```json +{ + "tokens" : [ + { + "token" : "design", + "start_offset" : 0, + "end_offset" : 6, + "type" : "", + "position" : 0 + }, + { + "token" : "patterns", + "start_offset" : 7, + "end_offset" : 15, + "type" : "", + "position" : 1 + }, + { + "token" : "object", + "start_offset" : 17, + "end_offset" : 23, + "type" : "", + "position" : 2 + }, + { + "token" : "oriented", + "start_offset" : 24, + "end_offset" : 32, + "type" : "", + "position" : 3 + }, + { + "token" : "software", + "start_offset" : 33, + "end_offset" : 41, + "type" : "", + "position" : 4 + } + ] +} +``` + +To get suggestions for a misspelled search term, use the term suggester. 
Specify the input text that needs suggestions in the `text` field, and specify the field from which to get suggestions in the `field` field: + +```json +GET books/_search +{ + "suggest": { + "spell-check": { + "text": "patern", + "term": { + "field": "title" + } + } + } +} +``` + +The term suggester returns a list of corrections for the input text in the `options` array: + +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "suggest" : { + "spell-check" : [ + { + "text" : "patern", + "offset" : 0, + "length" : 6, + "options" : [ + { + "text" : "patterns", + "score" : 0.6666666, + "freq" : 2 + } + ] + } + ] + } +} +``` + +The `score` value is calculated based on the edit distance. The higher the score, the better the suggestion. The `freq` is the frequency that represents the number of times the term appears in the documents of the specified index. + +You can include several suggestions in one request. The following example uses the term suggester for two different suggestions: + +```json +GET books/_search +{ + "suggest": { + "spell-check1" : { + "text" : "patern", + "term" : { + "field" : "title" + } + }, + "spell-check2" : { + "text" : "desing", + "term" : { + "field" : "title" + } + } + } +} +``` + +To receive suggestions for the same input text in multiple fields, you can define the text globally to avoid duplication: + +```json +GET books/_search +{ + "suggest": { + "text" : "patern", + "spell-check1" : { + "term" : { + "field" : "title" + } + }, + "spell-check2" : { + "term" : { + "field" : "subject" + } + } + } +} +``` + +If `text` is specified both at the global and individual suggestion levels, the suggestion-level value overrides the global value. + +### Term suggester options + +You can specify the following options to the term suggester. 
+ +Option | Description +:--- | :--- +field | The field from which to source suggestions. Required. Can be set for each suggestion or globally. +analyzer | The analyzer with which to analyze the input text. Defaults to the analyzer configured for the `field`. +size | The maximum number of suggestions to return for each token in the input text. +sort | Specifies how suggestions should be sorted in the response. Valid values are:
- `score`: Sort by similarity score, then document frequency, and then the term itself.
- `frequency`: Sort by document frequency, then similarity score, and then the term itself. +suggest_mode | The suggest mode specifies the terms for which suggestions should be included in the response. Valid values are:
- `missing`: Return suggestions only for the input text terms that are not in the index.
 - `popular`: Return suggestions only if they occur in more documents than the original input text term.
- `always`: Always return suggestions for each term in the input text.
 Default is `missing`. +max_edits | The maximum edit distance for suggestions. Valid values are in the [1, 2] range. Default is 2. +prefix_length | An integer that specifies the minimum length the matched prefix must be to start returning suggestions. If the prefix of `prefix_length` is not matched, but the search term is still within the edit distance, no suggestions are returned. Default is 1. Higher values improve spellcheck performance because misspellings don’t tend to occur at the beginning of words. +min_word_length | The minimum length a suggestion must be in order to be included in the response. Default is 4. +shard_size | The maximum number of candidate suggestions to obtain from each shard. After all candidate suggestions are considered, the top `shard_size` suggestions are returned. Default is equal to the `size` value. Shard-level document frequencies may not be exact because terms may reside in different shards. If `shard_size` is larger than `size`, the document frequencies for suggestions are more accurate, at the cost of decreased performance. +max_inspections | The multiplication factor for `shard_size`. The maximum number of candidate suggestions OpenSearch inspects to find suggestions is calculated as `shard_size` multiplied by `max_inspections`. May improve accuracy at the cost of decreased performance. Default is 5. +min_doc_freq | The minimum number or percentage of documents in which a suggestion should appear for it to be returned. May improve accuracy by returning only suggestions with high shard-level document frequencies. Valid values are integers that represent the document frequency or floats in the [0, 1] range that represent the percentage of documents. Default is 0 (feature disabled). +max_term_freq | The maximum number of documents in which a suggestion should appear in order for it to be returned. Valid values are integers that represent the document frequency or floats in the [0, 1] range that represent the percentage of documents. 
Default is 0.01. Excluding high-frequency terms improves spellcheck performance because high-frequency terms are usually spelled correctly. Uses shard-level document frequencies. +string_distance | The edit distance algorithm to use to determine similarity. Valid values are:
- `internal`: The default algorithm that is based on the [Damerau-Levenshtein algorithm](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) but is highly optimized for comparing edit distances for terms in the index.
- `damerau_levenshtein`: The edit distance algorithm based on the [Damerau-Levenshtein algorithm](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance).
- `levenshtein`: The edit distance algorithm based on the [Levenshtein edit distance algorithm](https://en.wikipedia.org/wiki/Levenshtein_distance).
- `jaro_winkler`: The edit distance algorithm based on the [Jaro-Winkler algorithm](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance).
- `ngram`: The edit distance algorithm based on character n-grams. + +## Phrase suggester + +To implement `did-you-mean`, use a phrase suggester. +The phrase suggester is similar to the term suggester, except it uses n-gram language models to suggest whole phrases instead of individual words. + +To set up a phrase suggester, create a custom analyzer called `trigram` that uses a `shingle` filter and lowercases tokens. This filter is similar to the `edge_ngram` filter, but it applies to words instead of letters. Then configure the field from which you'll be sourcing suggestions with the custom analyzer you created: + +```json +PUT books2 +{ + "settings": { + "index": { + "analysis": { + "analyzer": { + "trigram": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "shingle" + ] + } + }, + "filter": { + "shingle": { + "type": "shingle", + "min_shingle_size": 2, + "max_shingle_size": 3 + } + } + } + } + }, + "mappings": { + "properties": { + "title": { + "type": "text", + "fields": { + "trigram": { + "type": "text", + "analyzer": "trigram" + } + } + } + } + } +} +``` + +Index the documents into the new index: + +```json +PUT books2/_doc/1 +{ + "title": "Design Patterns" +} + +PUT books2/_doc/2 +{ + "title": "Software Architecture Patterns Explained" +} +``` + +Suppose the user searches for an incorrect phrase: + +```json +GET books2/_search +{ + "suggest": { + "phrase-check": { + "text": "design paterns", + "phrase": { + "field": "title.trigram" + } + } + } +} +``` + +The phrase suggester returns the corrected phrase: + +```json +{ + "took" : 4, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "suggest" : { + "phrase-check" : [ + { + "text" : "design paterns", + "offset" : 0, + "length" : 14, + "options" : [ + { + "text" : "design patterns", + "score" : 0.31666178 + } + ] 
+ } + ] + } +} +``` + +To highlight suggestions, set up the [`highlight`]({{site.url}}{{site.baseurl}}/opensearch/search/highlight) field for the phrase suggester: + +```json +GET books2/_search +{ + "suggest": { + "phrase-check": { + "text": "design paterns", + "phrase": { + "field": "title.trigram", + "gram_size": 3, + "highlight": { + "pre_tag": "", + "post_tag": "" + } + } + } + } +} +``` + +The results contain the highlighted text: + +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "suggest" : { + "phrase-check" : [ + { + "text" : "design paterns", + "offset" : 0, + "length" : 14, + "options" : [ + { + "text" : "design patterns", + "highlighted" : "design patterns", + "score" : 0.31666178 + } + ] + } + ] + } +} +``` + +### Phrase suggester options + +You can specify the following options to the phrase suggester. + +Option | Description +:--- | :--- +field | The field to use for n-gram lookups. The phrase suggester uses this field to calculate suggestion scores. Required. +gram_size | The maximum size `n` of the n-grams (shingles) in the field. If the field does not contain n-grams (shingles), omit this option or set it to 1. If the field uses a shingle filter, and `gram_size` is not set, `gram_size` is set to `max_shingle_size`. +real_word_error_likelihood | The probability that a term is misspelled, even if it exists in the dictionary. Default is 0.95 (5% of the words in the dictionary are misspelled). +confidence | The confidence level is a float factor that is multiplied by the input phrase's score to calculate a threshold score for other suggestions. Only suggestions with higher scores than the threshold are returned. A confidence level of 1.0 will only return suggestions that score higher than the input phrase. 
If `confidence` is set to 0, the top `size` candidates are returned. Default is 1. +max_errors | The maximum number or percentage of the terms that can be erroneous (spelled incorrectly) in order to return a suggestion. Valid values are integers that represent the number of terms or floats in the (0, 1) range that represent the percentage of the terms. Default is 1 (return only suggestions with at most one misspelled term). Setting this value to a high number can decrease performance. We recommend setting `max_errors` to a low number like 1 or 2 to reduce the time spent in suggest calls relative to the time spent in query execution. +separator | The separator for the terms in the bigram field. Defaults to the space character. +size | The number of candidate suggestions to generate for each query term. Specifying a higher value can result in terms with higher edit distances being returned. Default is 5. +analyzer | The analyzer with which to analyze the suggestion text. Defaults to the analyzer configured for the `field`. +shard_size | The maximum number of candidate suggestions to obtain from each shard. After all candidate suggestions are considered, the top `shard_size` suggestions are returned. Default is 5. +[collate](#collate-field)| Used to prune suggestions for which there are no matching documents in the index. +collate.query | Specifies a query against which suggestions are checked to prune the suggestions for which there are no matching documents in the index. +collate.prune | Specifies whether to return all suggestions. If `prune` is set to `false`, only those suggestions that have matching documents are returned. If `prune` is set to `true`, all suggestions are returned; each suggestion has an additional `collate_match` field that is `true` if the suggestion has matching documents and is `false` otherwise. Default is `false`. +highlight | Configures suggestion highlighting. Both `pre_tag` and `post_tag` values are required. 
+highlight.pre_tag | The starting tag for highlighting. +highlight.post_tag | The ending tag for highlighting. +[smoothing](#smoothing-models) | Smoothing model to balance the weight of the shingles that exist in the index frequently with the weight of the shingles that exist in the index infrequently. + + +### Collate field + +To filter out spellchecked suggestions that will not return any results, you can use the `collate` field. This field contains a scripted query that is run for each returned suggestion. See [Search templates]({{site.url}}{{site.baseurl}}/opensearch/search-template) for information on constructing a templated query. You can specify the current suggestion using the `{% raw %}{{suggestion}}{% endraw %}` variable, or you can pass your own template parameters in the `params` field (the suggestion value will be added to the variables you specify). + +The collate query for a suggestion is run only on the shard from which the suggestion was sourced. The query is required. + +Additionally, if the `prune` parameter is set to `true`, a `collate_match` field is added to each suggestion. If a query returns no results, the `collate_match` value is `false`. You can then filter out suggestions based on the `collate_match` field. The `prune` parameter's default value is `false`. 
+ +For example, the following query configures the `collate` field to run a `match_phrase` query matching the `title` field to the current suggestion: + +```json +GET books2/_search +{ + "suggest": { + "phrase-check": { + "text": "design paterns", + "phrase": { + "field": "title.trigram", + "collate" : { + "query" : { + "source": { + "match_phrase" : { + "title": "{{suggestion}}" + } + } + }, + "prune": true + } + } + } + } +} +``` + +The resulting suggestion contains the `collate_match` field set to `true`, which means the `match_phrase` query will return matching documents for the suggestion: + +```json +{ + "took" : 7, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "suggest" : { + "phrase-check" : [ + { + "text" : "design paterns", + "offset" : 0, + "length" : 14, + "options" : [ + { + "text" : "design patterns", + "score" : 0.56759655, + "collate_match" : true + } + ] + } + ] + } +} +``` + + +### Smoothing models + +For most use cases, when calculating a suggestion's score, you want to take into account not only the frequency of a shingle but also the shingle's size. Smoothing models are used to calculate scores for shingles of different sizes, balancing the weight of frequent and infrequent shingles. + +The following smoothing models are supported. + +Model | Description +:--- | :--- +stupid_backoff | Backs off to lower-order n-gram models if the higher-order n-gram count is 0 and multiplies the lower-order n-gram model by a constant factor (`discount`). This is the default smoothing model. +stupid_backoff.discount | The factor by which to multiply the lower-order n-gram model. Optional. Default is 0.4. +laplace | Uses additive smoothing, adding a constant `alpha` to all counts to balance weights. 
+laplace.alpha | The constant added to all counts to balance weights, typically 1.0 or smaller. Optional. Default is 0.5. + +By default, OpenSearch uses the Stupid Backoff model—a simple algorithm that starts with the shingles of the highest order and takes lower-order shingles if higher-order shingles are not found. For example, if you set up the phrase suggester to have 3-grams, 2-grams, and 1-grams, the Stupid Backoff model first inspects the 3-grams. If there are no 3-grams, it inspects 2-grams but multiplies the score by the `discount` factor. If there are no 2-grams, it inspects 1-grams but again multiplies the score by the `discount` factor. The Stupid Backoff model works well in most cases. If you need to choose the Laplace smoothing model, specify it in the `smoothing` parameter: + +```json +GET books2/_search +{ + "suggest": { + "phrase-check": { + "text": "design paterns", + "phrase": { + "field": "title.trigram", + "size" : 1, + "smoothing" : { + "laplace" : { + "alpha" : 0.7 + } + } + } + } + } +} +``` + +### Candidate generators + +Candidate generators provide possible suggestion terms based on the terms in the input text. There is one candidate generator available—`direct_generator`. A direct generator functions similarly to a term suggester: It is also called for each term in the input text. The phrase suggester supports multiple candidate generators, where each generator is called for each term in the input text. It also lets you specify a pre-filter (an analyzer that analyzes the input text terms before they enter the spellcheck phase) and a post-filter (an analyzer that analyzes the generated suggestions before they are returned). 
+ +Set up a direct generator for a phrase suggester: + +```json +GET books2/_search +{ + "suggest": { + "text": "design paterns", + "phrase-check": { + "phrase": { + "field": "title.trigram", + "size": 1, + "direct_generator": [ + { + "field": "title.trigram", + "suggest_mode": "always", + "min_word_length": 3 + } + ] + } + } + } +} +``` + +You can specify the following direct generator options. + +Option | Description +:--- | :--- +field | The field from which to source suggestions. Required. Can be set for each suggestion or globally. +size | The maximum number of suggestions to return for each token in the input text. +suggest_mode | The suggest mode specifies the terms for which suggestions generated on each shard should be included. The suggest mode is applied to suggestions for each shard and is not checked when combining suggestions from different shards. Therefore, if the suggest mode is `missing`, suggestions will be returned if the term is missing from one shard but exists on another shard. Valid values are:
- `missing`: Return suggestions only for the input text terms that are not in the shard.
- `popular`: Return suggestions only if they occur in more documents on the shard than the original input text term.
- `always`: Always return suggestions.
Default is `missing`. +max_edits | The maximum edit distance for suggestions. Valid values are in the [1, 2] range. Default is 2. +prefix_length | An integer that specifies the minimum length the matched prefix must be to start returning suggestions. If the prefix of `prefix_length` is not matched but the search term is still within the edit distance, no suggestions are returned. Default is 1. Higher values improve spellcheck performance because misspellings don’t tend to occur in the beginning of words. +min_word_length | The minimum length a suggestion must be in order to be included. Default is 4. +max_inspections | The multiplication factor for `shard_size`. The maximum number of candidate suggestions OpenSearch inspects to find suggestions is calculated as `shard_size` multiplied by `max_inspections`. May improve accuracy at the cost of decreased performance. Default is 5. +min_doc_freq | The minimum number or percentage of documents in which a suggestion should appear in order for it to be returned. May improve accuracy by returning only suggestions with high shard-level document frequencies. Valid values are integers that represent the document frequency or floats in the [0, 1] range that represent the percentage of documents. Default is 0 (feature disabled). +max_term_freq | The maximum number of documents in which a suggestion should appear in order for it to be returned. Valid values are integers that represent the document frequency or floats in the [0, 1] range that represent the percentage of documents. Default is 0.01. Excluding high-frequency terms improves spellcheck performance because high-frequency terms are usually spelled correctly. Uses shard-level document frequencies. +pre_filter | An analyzer that is applied to each input text token passed to the generator before a suggestion is generated. +post_filter | An analyzer that is applied to each generated suggestion before it is passed to the phrase scorer. 
diff --git a/_opensearch/search/highlight.md b/_opensearch/search/highlight.md new file mode 100644 index 0000000000..b58da944b2 --- /dev/null +++ b/_opensearch/search/highlight.md @@ -0,0 +1,964 @@ +--- +layout: default +title: Highlight query matches +parent: Searching data +nav_order: 23 +--- + +# Highlight query matches + +Highlighting emphasizes the search term(s) in the results so you can emphasize the query matches. + +To highlight the search terms, add a `highlight` parameter outside of the query block: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "text_entry": "life" + } + }, + "size": 3, + "highlight": { + "fields": { + "text_entry": {} + } + } +} +``` + +Each document in the results contains a `highlight` object that shows your search term wrapped in an `em` tag: + +```json +{ + "took" : 3, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 805, + "relation" : "eq" + }, + "max_score" : 7.450247, + "hits" : [ + { + "_index" : "shakespeare", + "_id" : "33765", + "_score" : 7.450247, + "_source" : { + "type" : "line", + "line_id" : 33766, + "play_name" : "Hamlet", + "speech_number" : 60, + "line_number" : "2.2.233", + "speaker" : "HAMLET", + "text_entry" : "my life, except my life." + }, + "highlight" : { + "text_entry" : [ + "my life, except my life." 
+ ] + } + }, + { + "_index" : "shakespeare", + "_id" : "51877", + "_score" : 6.873042, + "_source" : { + "type" : "line", + "line_id" : 51878, + "play_name" : "King Lear", + "speech_number" : 18, + "line_number" : "4.6.52", + "speaker" : "EDGAR", + "text_entry" : "The treasury of life, when life itself" + }, + "highlight" : { + "text_entry" : [ + "The treasury of life, when life itself" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "39245", + "_score" : 6.6167283, + "_source" : { + "type" : "line", + "line_id" : 39246, + "play_name" : "Henry V", + "speech_number" : 7, + "line_number" : "4.7.31", + "speaker" : "FLUELLEN", + "text_entry" : "mark Alexanders life well, Harry of Monmouths life" + }, + "highlight" : { + "text_entry" : [ + "mark Alexanders life well, Harry of Monmouths life" + ] + } + } + ] + } +} +``` + +The highlight function works on the actual field contents. OpenSearch retrieves these contents either from the stored field (the field for which the `store` mapping parameter is set to `true`) or from the `_source` field if the field is not stored. You can force the retrieval of field contents from the `_source` field by setting the `force_source` parameter to `true`. + +The `highlight` parameter highlights the original terms even when using synonyms or stemming for the search itself. +{: .note} + +## Methods of obtaining offsets + +To highlight the search terms, the highlighter needs the start and end character offsets of each term. The offsets mark the term's position in the original text. The highlighter can obtain the offsets from the following sources: + +- **Postings**: When documents are indexed, OpenSearch creates an inverted search index—a core data structure used to search for documents. Postings represent the inverted search index and store the mapping of each analyzed term to the list of documents in which it occurs. 
If you set the `index_options` parameter to `offsets` when mapping a [text field]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/text), OpenSearch adds each term's start and end character offsets to the inverted index. During highlighting, the highlighter reruns the original query directly on the postings to locate each term. Thus, storing offsets makes highlighting more efficient for large fields because it does not require reanalyzing the text. Storing term offsets requires additional disk space, but uses less disk space than storing term vectors. + +- **Term vectors**: If you set the [`term_vector` parameter]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/text#term-vector-parameter) to `with_positions_offsets` when mapping a text field, the highlighter uses the `term_vector` to highlight the field. Storing term vectors requires the most disk space. However, it makes highlighting faster for fields larger than 1 MB and for multi-term queries like prefix or wildcard because term vectors provide access to the dictionary of terms for each document. + +- **Reanalyzing text**: In the absence of both postings and term vectors, the highlighter reanalyzes text in order to highlight it. For every document and every field that needs highlighting, the highlighter creates a small in-memory index and reruns the original query through Lucene’s query execution planner to access low-level match information for the current document. Reanalyzing the text works well in most use cases. However, this method is more memory and time intensive for large fields. + +## Highlighter types + +OpenSearch supports three highlighter implementations: `plain`, `unified`, and `fvh` (Fast Vector Highlighter). + +The following table lists the methods of obtaining the offsets for each highlighter. + +Highlighter | Method of obtaining offsets +:--- | :--- +[`unified`](#the-unified-highlighter) | Term vectors if `term_vector` is set to `with_positions_offsets`,
postings if `index_options` is set to `offsets`,
reanalyzing text otherwise. +[`fvh`](#the-fvh-highlighter) | Term vectors. +[`plain`](#the-plain-highlighter) | Reanalyzing text. + +### Setting the highlighter type + +To set the highlighter type, specify it in the `type` field: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "text_entry": "life" + } + }, + "highlight": { + "fields": { + "text_entry": { "type": "plain"} + } + } +} +``` + +### The `unified` highlighter + +The `unified` highlighter is based on the Lucene Unified Highlighter and is the default highlighter for OpenSearch. It divides the text into sentences and treats those sentences as individual documents, scoring them in terms of similarity using the BM25 algorithm. The `unified` highlighter supports both exact phrase and multi-term highlighting, including fuzzy, prefix, and regex. If you're using complex queries to highlight multiple fields in multiple documents, we recommend using the `unified` highlighter on `postings` or `term_vector` fields. + +### The `fvh` highlighter + +The `fvh` highlighter is based on the Lucene Fast Vector Highlighter. To use this highlighter, you need to store term vectors with positions offsets, which increases the index size. The `fvh` highlighter can combine matched terms from multiple fields into one result. It can also assign weights to matches depending on their positions; thus, you can sort phrase matches above term matches when highlighting a query that boosts phrase matches over term matches. Additionally, you can configure the `fvh` highlighter to select the boundaries of a returned text fragment, and you can highlight multiple words with different tags. + +### The `plain` highlighter + +The `plain` highlighter is based on the standard Lucene highlighter. It requires the highlighted fields to be stored either individually or in the `_source` field. The `plain` highlighter mirrors the query matching logic, in particular word importance and positions in phrase queries. 
It works for most use cases but may be slow for large fields because it has to reanalyze the text to be highlighted. + +## Highlighting options + +The following table describes the highlighting options you can specify on a global or field level. Field-level settings override global settings. + +Option | Description +:--- | :--- +type | Specifies the highlighter to use. Valid values are `unified`, `fvh`, and `plain`. Default is `unified`. +fields | Specifies the fields to search for text to be highlighted. Supports wildcard expressions. If you use wildcards, only `text` and `keyword` fields are highlighted. For example, you can set `fields` to `my_field*` to include all `text` and `keyword` fields that start with the prefix `my_field`. +force_source | Specifies that field values for highlighting should be obtained from the `_source` field rather than from stored field values. Default is `false`. +require_field_match | Specifies whether to highlight only fields that contain a search query match. Default is `true`. To highlight all fields, set this option to `false`. +pre_tags | Specifies the HTML start tags for the highlighted text as an array of strings. +post_tags | Specifies the HTML end tags for the highlighted text as an array of strings. +tags_schema | If you set this option to `styled`, OpenSearch uses the built-in tag schema. In this schema, the `pre_tags` are ``, ``, ``, ``, ``, ``, ``, ``, ``, and ``, and the `post_tags` is ``. +boundary_chars | All boundary characters combined in a string.
Default is `".,!? \t\n"`. +boundary_scanner | Valid only for the `unified` and `fvh` highlighters. Specifies whether to split the highlighted fragments into sentences, words, or characters. Valid values are the following:
- `sentence`: Split highlighted fragments at sentence boundaries, as defined by the [BreakIterator](https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html). You can specify the BreakIterator's locale in the `boundary_scanner_locale` option.
- `word`: Split highlighted fragments at word boundaries, as defined by the [BreakIterator](https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html). You can specify the BreakIterator's locale in the `boundary_scanner_locale` option.
- `chars`: Split highlighted fragments at any character listed in `boundary_chars`. Valid only for the `fvh` highlighter. +boundary_scanner_locale | Provides a [locale](https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html) for the `boundary_scanner`. Valid values are language tags (for example, `"en-US"`). Default is [Locale.ROOT](https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html#ROOT). +boundary_max_scan | Controls how far to scan for boundary characters when the `boundary_scanner` parameter for the `fvh` highlighter is set to `chars`. Default is 20. +encoder | Specifies whether the highlighted fragment should be HTML encoded before it is returned. Valid values are `default` (no encoding) or `html` (first escape the HTML text and then insert the highlighting tags). For example, if the field text is `<h3>Hamlet</h3>` and the `encoder` is set to `html`, the highlighted text is `"&lt;h3&gt;<em>Hamlet</em>&lt;/h3&gt;"`. +fragmenter | Specifies how to split text into highlighted fragments. Valid only for the `plain` highlighter. Valid values are the following:
- `span` (default): Splits text into fragments of the same size but tries not to split text between highlighted terms.
- `simple`: Splits text into fragments of the same size. +fragment_offset | Specifies the character offset from which you want to start highlighting. Valid for the `fvh` highlighter only. +fragment_size | The size of a highlighted fragment, specified as the number of characters. If `number_of_fragments` is set to 0, `fragment_size` is ignored. Default is 100. +number_of_fragments| The maximum number of returned fragments. If `number_of_fragments` is set to 0, OpenSearch returns the highlighted contents of the entire field. Default is 5. +order | The sort order for the highlighted fragments. Set `order` to `score` to sort fragments by relevance. Each highlighter has a different algorithm for calculating relevance scores. Default is `none`. +highlight_query | Specifies that matches for a query other than the search query should be highlighted. The `highlight_query` option is useful when you use a faster query to get document matches and a slower query (for example, `rescore_query`) to refine the results. We recommend including the search query as part of the `highlight_query`. +matched_fields | Combines matches from different fields to highlight one field. The most common use case for this functionality is highlighting text that is analyzed in different ways and kept in multi-fields. All fields in the `matched_fields` list must have the `term_vector` field set to `with_positions_offsets`. The field in which the matches are combined is the only loaded field, so it is beneficial to set its `store` option to `true`. Valid only for the `fvh` highlighter. +no_match_size | Specifies the number of characters, starting from the beginning of the field, to return if there are no matching fragments to highlight. Default is 0. +phrase_limit | The number of matching phrases in a document that are considered. Limits the number of phrases analyzed by the `fvh` highlighter to avoid consuming a lot of memory. 
If `matched_fields` are used, `phrase_limit` specifies the number of phrases for each matched field. A higher `phrase_limit` leads to increased query time and more memory consumption. Valid only for the `fvh` highlighter. Default is 256. + +The unified highlighter's sentence scanner splits sentences larger than `fragment_size` at the first word boundary after `fragment_size` is reached. To return whole sentences without splitting them, set `fragment_size` to 0. +{: .note} + +## Changing the highlighting tags + +Design your application code to parse the results from the `highlight` object and perform an action on the search terms, such as changing their color, bolding, italicizing, and so on. + +To change the default `em` tags, specify the new tags in the `pre_tags` and `post_tags` parameters: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "play_name": "Henry IV" + } + }, + "size": 3, + "highlight": { + "pre_tags": [ + "" + ], + "post_tags": [ + "" + ], + "fields": { + "play_name": {} + } + } +} +``` + +The play name is highlighted by the new tags in the response: + +```json +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3205, + "relation" : "eq" + }, + "max_score" : 3.548232, + "hits" : [ + { + "_index" : "shakespeare", + "_id" : "0", + "_score" : 3.548232, + "_source" : { + "type" : "act", + "line_id" : 1, + "play_name" : "Henry IV", + "speech_number" : "", + "line_number" : "", + "speaker" : "", + "text_entry" : "ACT I" + }, + "highlight" : { + "play_name" : [ + "Henry IV" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "1", + "_score" : 3.548232, + "_source" : { + "type" : "scene", + "line_id" : 2, + "play_name" : "Henry IV", + "speech_number" : "", + "line_number" : "", + "speaker" : "", + "text_entry" : "SCENE I. London. The palace." 
+ }, + "highlight" : { + "play_name" : [ + "Henry IV" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "2", + "_score" : 3.548232, + "_source" : { + "type" : "line", + "line_id" : 3, + "play_name" : "Henry IV", + "speech_number" : "", + "line_number" : "", + "speaker" : "", + "text_entry" : "Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others" + }, + "highlight" : { + "play_name" : [ + "Henry IV" + ] + } + } + ] + } +} +``` + +## Specifying a highlight query + +By default, OpenSearch only considers the search query for highlighting. If you use a fast query to get document matches and a slower query like `rescore_query` to refine the results, it is useful to highlight the refined results. You can do this by adding a `highlight_query`: + +```json +GET shakespeare/_search +{ + "query": { + "match": { + "text_entry": { + "query": "thats my name" + } + } + }, + "rescore": { + "window_size": 20, + "query": { + "rescore_query": { + "match_phrase": { + "text_entry": { + "query": "thats my name", + "slop": 1 + } + } + }, + "rescore_query_weight": 5 + } + }, + "_source": false, + "highlight": { + "order": "score", + "fields": { + "text_entry": { + "highlight_query": { + "bool": { + "must": { + "match": { + "text_entry": { + "query": "thats my name" + } + } + }, + "should": { + "match_phrase": { + "text_entry": { + "query": "that is my name", + "slop": 1, + "boost": 10.0 + } + } + }, + "minimum_should_match": 0 + } + } + } + } + } +} +``` + +## Combining matches from different fields to highlight one field + +You can combine matches from different fields to highlight one field with the `fvh` highlighter. The most common use case for this functionality is highlighting text that is analyzed in different ways and kept in multi-fields. All fields in the `matched_fields` list must have the `term_vector` field set to `with_positions_offsets`. 
The field in which the matches are combined is the only loaded field, so it is beneficial to set its `store` option to `true`. + +### Example + +Create a mapping for the `shakespeare` index where the `text_entry` field is analyzed with the `standard` analyzer and has an `english` subfield that is analyzed with the `english` analyzer: + +```json +PUT shakespeare +{ + "mappings" : { + "properties" : { + "text_entry" : { + "type" : "text", + "term_vector": "with_positions_offsets", + "fields": { + "english": { + "type": "text", + "analyzer": "english", + "term_vector": "with_positions_offsets" + } + } + } + } + } +} +``` + +The `standard` analyzer splits the `text_entry` field into individual words. You can confirm this by using the analyze API operation: + +```json +GET shakespeare/_analyze +{ + "text": "bragging of thine", + "field": "text_entry" +} +``` + +The response contains the original string split on white space: + +```json +{ + "tokens" : [ + { + "token" : "bragging", + "start_offset" : 0, + "end_offset" : 8, + "type" : "", + "position" : 0 + }, + { + "token" : "of", + "start_offset" : 9, + "end_offset" : 11, + "type" : "", + "position" : 1 + }, + { + "token" : "thine", + "start_offset" : 12, + "end_offset" : 17, + "type" : "", + "position" : 2 + } + ] +} +``` + +The `english` analyzer not only splits the string into words but also stems the tokens and removes stopwords. 
You can confirm this by using the analyze API operation with the `text_entry.english` field: + +```json +GET shakespeare/_analyze +{ + "text": "bragging of thine", + "field": "text_entry.english" +} +``` + +The response contains the stemmed words: + +```json +{ + "tokens" : [ + { + "token" : "brag", + "start_offset" : 0, + "end_offset" : 8, + "type" : "", + "position" : 0 + }, + { + "token" : "thine", + "start_offset" : 12, + "end_offset" : 17, + "type" : "", + "position" : 2 + } + ] +} +``` + +To search for all forms of the word `bragging`, use the following query: + +```json +GET shakespeare/_search +{ + "query": { + "query_string": { + "query": "text_entry.english:bragging", + "fields": [ + "text_entry" + ] + } + }, + "highlight": { + "order": "score", + "fields": { + "text_entry": { + "matched_fields": [ + "text_entry", + "text_entry.english" + ], + "type": "fvh" + } + } + } +} +``` + +The response highlights all versions of the word "bragging" in the `text_entry` field: + +```json +{ + "took" : 5, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 26, + "relation" : "eq" + }, + "max_score" : 10.153671, + "hits" : [ + { + "_index" : "shakespeare", + "_id" : "56666", + "_score" : 10.153671, + "_source" : { + "type" : "line", + "line_id" : 56667, + "play_name" : "macbeth", + "speech_number" : 34, + "line_number" : "2.3.118", + "speaker" : "MACBETH", + "text_entry" : "Is left this vault to brag of." + }, + "highlight" : { + "text_entry" : [ + "Is left this vault to brag of." 
+ ] + } + }, + { + "_index" : "shakespeare", + "_id" : "71445", + "_score" : 9.284528, + "_source" : { + "type" : "line", + "line_id" : 71446, + "play_name" : "Much Ado about nothing", + "speech_number" : 18, + "line_number" : "5.1.65", + "speaker" : "LEONATO", + "text_entry" : "As under privilege of age to brag" + }, + "highlight" : { + "text_entry" : [ + "As under privilege of age to brag" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "86782", + "_score" : 9.284528, + "_source" : { + "type" : "line", + "line_id" : 86783, + "play_name" : "Romeo and Juliet", + "speech_number" : 8, + "line_number" : "2.6.31", + "speaker" : "JULIET", + "text_entry" : "Brags of his substance, not of ornament:" + }, + "highlight" : { + "text_entry" : [ + "Brags of his substance, not of ornament:" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "44531", + "_score" : 8.552448, + "_source" : { + "type" : "line", + "line_id" : 44532, + "play_name" : "King John", + "speech_number" : 15, + "line_number" : "3.1.124", + "speaker" : "CONSTANCE", + "text_entry" : "A ramping fool, to brag and stamp and swear" + }, + "highlight" : { + "text_entry" : [ + "A ramping fool, to brag and stamp and swear" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "63208", + "_score" : 8.552448, + "_source" : { + "type" : "line", + "line_id" : 63209, + "play_name" : "Merchant of Venice", + "speech_number" : 11, + "line_number" : "3.4.79", + "speaker" : "PORTIA", + "text_entry" : "A thousand raw tricks of these bragging Jacks," + }, + "highlight" : { + "text_entry" : [ + "A thousand raw tricks of these bragging Jacks," + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "73026", + "_score" : 8.552448, + "_source" : { + "type" : "line", + "line_id" : 73027, + "play_name" : "Othello", + "speech_number" : 75, + "line_number" : "2.1.242", + "speaker" : "IAGO", + "text_entry" : "but for bragging and telling her fantastical lies:" + }, + "highlight" : { + "text_entry" : [ + "but for bragging and 
telling her fantastical lies:" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "85974", + "_score" : 8.552448, + "_source" : { + "type" : "line", + "line_id" : 85975, + "play_name" : "Romeo and Juliet", + "speech_number" : 20, + "line_number" : "1.5.70", + "speaker" : "CAPULET", + "text_entry" : "And, to say truth, Verona brags of him" + }, + "highlight" : { + "text_entry" : [ + "And, to say truth, Verona brags of him" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "96800", + "_score" : 8.552448, + "_source" : { + "type" : "line", + "line_id" : 96801, + "play_name" : "Titus Andronicus", + "speech_number" : 60, + "line_number" : "1.1.311", + "speaker" : "SATURNINUS", + "text_entry" : "Agree these deeds with that proud brag of thine," + }, + "highlight" : { + "text_entry" : [ + "Agree these deeds with that proud brag of thine," + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "18189", + "_score" : 7.9273787, + "_source" : { + "type" : "line", + "line_id" : 18190, + "play_name" : "As you like it", + "speech_number" : 12, + "line_number" : "5.2.30", + "speaker" : "ROSALIND", + "text_entry" : "and Caesars thrasonical brag of I came, saw, and" + }, + "highlight" : { + "text_entry" : [ + "and Caesars thrasonical brag of I came, saw, and" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "32054", + "_score" : 7.9273787, + "_source" : { + "type" : "line", + "line_id" : 32055, + "play_name" : "Cymbeline", + "speech_number" : 52, + "line_number" : "5.5.211", + "speaker" : "IACHIMO", + "text_entry" : "And then a mind put int, either our brags" + }, + "highlight" : { + "text_entry" : [ + "And then a mind put int, either our brags" + ] + } + } + ] + } +} +``` + +To score the original form of the word "bragging" higher, you can boost the `text_entry` field: + +```json +GET shakespeare/_search +{ + "query": { + "query_string": { + "query": "bragging", + "fields": [ + "text_entry^5", + "text_entry.english" + ] + } + }, + "highlight": { + "order": "score", 
+ "fields": { + "text_entry": { + "matched_fields": [ + "text_entry", + "text_entry.english" + ], + "type": "fvh" + } + } + } +} +``` + +The response lists documents that contain the word "bragging" first: + +```json +{ + "took" : 17, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 26, + "relation" : "eq" + }, + "max_score" : 49.746853, + "hits" : [ + { + "_index" : "shakespeare", + "_id" : "45739", + "_score" : 49.746853, + "_source" : { + "type" : "line", + "line_id" : 45740, + "play_name" : "King John", + "speech_number" : 10, + "line_number" : "5.1.51", + "speaker" : "BASTARD", + "text_entry" : "Of bragging horror: so shall inferior eyes," + }, + "highlight" : { + "text_entry" : [ + "Of bragging horror: so shall inferior eyes," + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "63208", + "_score" : 47.077244, + "_source" : { + "type" : "line", + "line_id" : 63209, + "play_name" : "Merchant of Venice", + "speech_number" : 11, + "line_number" : "3.4.79", + "speaker" : "PORTIA", + "text_entry" : "A thousand raw tricks of these bragging Jacks," + }, + "highlight" : { + "text_entry" : [ + "A thousand raw tricks of these bragging Jacks," + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "68474", + "_score" : 47.077244, + "_source" : { + "type" : "line", + "line_id" : 68475, + "play_name" : "A Midsummer nights dream", + "speech_number" : 101, + "line_number" : "3.2.427", + "speaker" : "PUCK", + "text_entry" : "Thou coward, art thou bragging to the stars," + }, + "highlight" : { + "text_entry" : [ + "Thou coward, art thou bragging to the stars," + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "73026", + "_score" : 47.077244, + "_source" : { + "type" : "line", + "line_id" : 73027, + "play_name" : "Othello", + "speech_number" : 75, + "line_number" : "2.1.242", + "speaker" : "IAGO", + "text_entry" : "but for bragging and telling her fantastical lies:" + 
}, + "highlight" : { + "text_entry" : [ + "but for bragging and telling her fantastical lies:" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "39816", + "_score" : 44.679565, + "_source" : { + "type" : "line", + "line_id" : 39817, + "play_name" : "Henry V", + "speech_number" : 28, + "line_number" : "5.2.138", + "speaker" : "KING HENRY V", + "text_entry" : "armour on my back, under the correction of bragging" + }, + "highlight" : { + "text_entry" : [ + "armour on my back, under the correction of bragging" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "63200", + "_score" : 44.679565, + "_source" : { + "type" : "line", + "line_id" : 63201, + "play_name" : "Merchant of Venice", + "speech_number" : 11, + "line_number" : "3.4.71", + "speaker" : "PORTIA", + "text_entry" : "Like a fine bragging youth, and tell quaint lies," + }, + "highlight" : { + "text_entry" : [ + "Like a fine bragging youth, and tell quaint lies," + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "56666", + "_score" : 10.153671, + "_source" : { + "type" : "line", + "line_id" : 56667, + "play_name" : "macbeth", + "speech_number" : 34, + "line_number" : "2.3.118", + "speaker" : "MACBETH", + "text_entry" : "Is left this vault to brag of." + }, + "highlight" : { + "text_entry" : [ + "Is left this vault to brag of." 
+ ] + } + }, + { + "_index" : "shakespeare", + "_id" : "71445", + "_score" : 9.284528, + "_source" : { + "type" : "line", + "line_id" : 71446, + "play_name" : "Much Ado about nothing", + "speech_number" : 18, + "line_number" : "5.1.65", + "speaker" : "LEONATO", + "text_entry" : "As under privilege of age to brag" + }, + "highlight" : { + "text_entry" : [ + "As under privilege of age to brag" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "86782", + "_score" : 9.284528, + "_source" : { + "type" : "line", + "line_id" : 86783, + "play_name" : "Romeo and Juliet", + "speech_number" : 8, + "line_number" : "2.6.31", + "speaker" : "JULIET", + "text_entry" : "Brags of his substance, not of ornament:" + }, + "highlight" : { + "text_entry" : [ + "Brags of his substance, not of ornament:" + ] + } + }, + { + "_index" : "shakespeare", + "_id" : "44531", + "_score" : 8.552448, + "_source" : { + "type" : "line", + "line_id" : 44532, + "play_name" : "King John", + "speech_number" : 15, + "line_number" : "3.1.124", + "speaker" : "CONSTANCE", + "text_entry" : "A ramping fool, to brag and stamp and swear" + }, + "highlight" : { + "text_entry" : [ + "A ramping fool, to brag and stamp and swear" + ] + } + } + ] + } +} +``` + +## Query limitations + +Note the following limitations: + +- When extracting terms to highlight, highlighters don’t reflect the Boolean logic of a query. Therefore, for some complex Boolean queries, such as nested Boolean queries and queries using `minimum_should_match`, OpenSearch may highlight terms that don’t correspond to query matches. +- The `fvh` highlighter does not support span queries. 
\ No newline at end of file diff --git a/_opensearch/search/index.md b/_opensearch/search/index.md new file mode 100644 index 0000000000..35c6671cd6 --- /dev/null +++ b/_opensearch/search/index.md @@ -0,0 +1,20 @@ +--- +layout: default +title: Searching data +nav_order: 20 +has_children: true +has_toc: false +redirect_from: /opensearch/ux/ +--- + +# Searching data + +What users expect from search engines has evolved over the years. Just returning relevant results quickly is no longer enough for most users. Now users seek methods that allow them to get even more relevant results, to sort and organize results, and to highlight their queries. OpenSearch includes many features, described in the following table, that enhance the search experience. + +Feature | Description +:--- | :--- +[Autocomplete functionality]({{site.url}}{{site.baseurl}}/opensearch/search/autocomplete) | Suggest phrases as the user types. +[Did-you-mean functionality]({{site.url}}{{site.baseurl}}/opensearch/search/did-you-mean) | Check spelling of phrases as the user types. +[Paginate results]({{site.url}}{{site.baseurl}}/opensearch/search/paginate) | Rather than a single, long list, separate search results into pages. +[Sort results]({{site.url}}{{site.baseurl}}/opensearch/search/sort) | Allow sorting of results by different criteria. +[Highlight query matches]({{site.url}}{{site.baseurl}}/opensearch/search/highlight) | Highlight the search term in the results. diff --git a/_opensearch/search/paginate.md b/_opensearch/search/paginate.md new file mode 100644 index 0000000000..e319b0b10d --- /dev/null +++ b/_opensearch/search/paginate.md @@ -0,0 +1,274 @@ +--- +layout: default +title: Paginate results +parent: Searching data +nav_order: 21 +--- + +## Paginate results + +You can use the following methods to paginate search results in OpenSearch: + +1. The [`from` and `size` parameters](#the-from-and-size-parameters) +1. The [scroll search](#scroll-search) operation +1. 
The [`search_after` parameter](#the-search_after-parameter) + +## The `from` and `size` parameters + +The `from` and `size` parameters return results one page at a time. + +The `from` parameter is the document number from which you want to start showing the results. The `size` parameter is the number of results that you want to show. Together, they let you return a subset of the search results. + +For example, if the value of `size` is 10 and the value of `from` is 0, you see the first 10 results. If you change the value of `from` to 10, you see the next 10 results (because the results are zero-indexed). So if you want to see results starting from result 11, `from` must be 10. + +```json +GET shakespeare/_search +{ + "from": 0, + "size": 10, + "query": { + "match": { + "play_name": "Hamlet" + } + } +} +``` + +Use the following formula to calculate the `from` parameter relative to the page number: + +```json +from = size * (page_number - 1) +``` + +Each time the user chooses the next page of the results, your application needs to run the same search query with an incremented `from` value. + +You can also specify the `from` and `size` parameters in the search URI: + +```json +GET shakespeare/_search?from=0&size=10 +``` + +If you only specify the `size` parameter, the `from` parameter defaults to 0. + +Querying for pages deep in your results can have a significant performance impact, so OpenSearch limits this approach to 10,000 results. + +The `from` and `size` parameters are stateless, so the results are based on the latest available data. +This can cause inconsistent pagination. +For example, assume a user stays on the first page of the results and then navigates to the second page. During that time, a new document relevant to the user's search is indexed and shows up on the first page. 
In this scenario, the last result on the first page is pushed to the second page, and the user sees duplicate results (that is, the first and second pages both display that last result). + +Use the `scroll` operation for consistent pagination. The `scroll` operation keeps a search context open for a certain period of time. Any data changes do not affect the results during that time. + + +## Scroll search + +The `from` and `size` parameters allow you to paginate your search results but with a limit of 10,000 results at a time. + +If you need to request volumes of data larger than 1 PB from, for example, a machine learning job, use the `scroll` operation instead. The `scroll` operation allows you to request an unlimited number of results. + +To use the `scroll` operation, add a `scroll` query parameter to the request URL that tells OpenSearch how long to keep the search context open. This time period needs to be long enough to process a single batch of results. + +To set the number of results that you want returned for each batch, use the `size` parameter: + +```json +GET shakespeare/_search?scroll=10m +{ + "size": 10000 +} +``` + +OpenSearch caches the results and returns a scroll ID that you can use to access them in batches: + +```json +"_scroll_id" : "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAUWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ==" +``` + +Pass this scroll ID to the `scroll` operation to obtain the next batch of results: + +```json +GET _search/scroll +{ + "scroll": "10m", + "scroll_id": "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAUWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ==" +} +``` + +Using this scroll ID, you get results in batches of 10,000 as long as the search context is still open. Typically, the scroll ID does not change between requests, but it *can* change, so make sure to always use the latest scroll ID. If you don't send the next scroll request before the search context expires, the `scroll` operation does not return any results. 
+ +If you expect billions of results, use a sliced scroll. Slicing allows you to perform multiple scroll operations for the same request but in parallel. +Set the ID and the maximum number of slices for the scroll: + +```json +GET shakespeare/_search?scroll=10m +{ + "slice": { + "id": 0, + "max": 10 + }, + "query": { + "match_all": {} + } +} +``` + +With a single scroll ID, you receive 10 results. +You can have up to 10 IDs. +Perform the same command with the ID equal to 1: + +```json +GET shakespeare/_search?scroll=10m +{ + "slice": { + "id": 1, + "max": 10 + }, + "query": { + "match_all": {} + } +} +``` + +Close the search context when you’re done scrolling, because it continues to consume computing resources until the timeout: + +```json +DELETE _search/scroll/DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAcWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ== +``` + +#### Sample Response + +```json +{ + "succeeded": true, + "num_freed": 1 +} +``` + +Use the following request to close all open scroll contexts: + +```json +DELETE _search/scroll/_all +``` + +The `scroll` operation corresponds to a specific timestamp. It doesn't consider documents added after that timestamp as potential results. + +Because open search contexts consume a lot of memory, we suggest you don't use the `scroll` operation for frequent user queries that don't need the search context to be open. Instead, use the `sort` parameter with the `search_after` parameter to scroll responses for user queries. + +## The `search_after` parameter + +The `search_after` parameter provides a live cursor that uses the previous page's results to obtain the next page's results. It is similar to the `scroll` operation in that it is meant to scroll many queries in parallel. 
+ +For example, the following query sorts all lines from the play "Hamlet" by the speech number and then the ID and retrieves the first three results: + +```json +GET shakespeare/_search +{ + "size": 3, + "query": { + "match": { + "play_name": "Hamlet" + } + }, + "sort": [ + { "speech_number": "asc" }, + { "_id": "asc" } + ] +} +``` + +The response contains the `sort` array of values for each document: + +```json +{ + "took" : 7, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 4244, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ + { + "_index" : "shakespeare", + "_id" : "32435", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 32436, + "play_name" : "Hamlet", + "speech_number" : 1, + "line_number" : "1.1.1", + "speaker" : "BERNARDO", + "text_entry" : "Whos there?" + }, + "sort" : [ + 1, + "32435" + ] + }, + { + "_index" : "shakespeare", + "_id" : "32634", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 32635, + "play_name" : "Hamlet", + "speech_number" : 1, + "line_number" : "1.2.1", + "speaker" : "KING CLAUDIUS", + "text_entry" : "Though yet of Hamlet our dear brothers death" + }, + "sort" : [ + 1, + "32634" + ] + }, + { + "_index" : "shakespeare", + "_id" : "32635", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 32636, + "play_name" : "Hamlet", + "speech_number" : 1, + "line_number" : "1.2.2", + "speaker" : "KING CLAUDIUS", + "text_entry" : "The memory be green, and that it us befitted" + }, + "sort" : [ + 1, + "32635" + ] + } + ] + } +} +``` + +You can use the last result's `sort` values to retrieve the next result by using the `search_after` parameter: + +```json +GET shakespeare/_search +{ + "size": 10, + "query": { + "match": { + "play_name": "Hamlet" + } + }, + "search_after": [ 1, "32635"], + "sort": [ + { "speech_number": "asc" }, + { "_id": "asc" } + ] +} +``` + +Unlike the `scroll` 
operation, the `search_after` parameter is stateless, so the document order may change because of documents being indexed or deleted. diff --git a/_opensearch/search/sort.md b/_opensearch/search/sort.md new file mode 100644 index 0000000000..dac96d175a --- /dev/null +++ b/_opensearch/search/sort.md @@ -0,0 +1,880 @@ +--- +layout: default +title: Sort results +parent: Searching data +nav_order: 22 +--- + +## Sort results + +Sorting allows your users to sort results in a way that’s most meaningful to them. + +By default, full-text queries sort results by the relevance score. +You can choose to sort the results by any field value in either ascending or descending order by setting the `order` parameter to `asc` or `desc`. + +For example, to sort results by descending order of a `line_id` value, use the following query: + +```json +GET shakespeare/_search +{ + "query": { + "term": { + "play_name": { + "value": "Henry IV" + } + } + }, + "sort": [ + { + "line_id": { + "order": "desc" + } + } + ] +} +``` + +The results are sorted by `line_id` in descending order: + +```json +{ + "took" : 24, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3205, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ + { + "_index" : "shakespeare", + "_id" : "3204", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 3205, + "play_name" : "Henry IV", + "speech_number" : 8, + "line_number" : "", + "speaker" : "KING HENRY IV", + "text_entry" : "Exeunt" + }, + "sort" : [ + 3205 + ] + }, + { + "_index" : "shakespeare", + "_id" : "3203", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 3204, + "play_name" : "Henry IV", + "speech_number" : 8, + "line_number" : "5.5.45", + "speaker" : "KING HENRY IV", + "text_entry" : "Let us not leave till all our own be won." 
+ }, + "sort" : [ + 3204 + ] + }, + { + "_index" : "shakespeare", + "_id" : "3202", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 3203, + "play_name" : "Henry IV", + "speech_number" : 8, + "line_number" : "5.5.44", + "speaker" : "KING HENRY IV", + "text_entry" : "And since this business so fair is done," + }, + "sort" : [ + 3203 + ] + }, + { + "_index" : "shakespeare", + "_id" : "3201", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 3202, + "play_name" : "Henry IV", + "speech_number" : 8, + "line_number" : "5.5.43", + "speaker" : "KING HENRY IV", + "text_entry" : "Meeting the cheque of such another day:" + }, + "sort" : [ + 3202 + ] + }, + { + "_index" : "shakespeare", + "_id" : "3200", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 3201, + "play_name" : "Henry IV", + "speech_number" : 8, + "line_number" : "5.5.42", + "speaker" : "KING HENRY IV", + "text_entry" : "Rebellion in this land shall lose his sway," + }, + "sort" : [ + 3201 + ] + }, + { + "_index" : "shakespeare", + "_id" : "3199", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 3200, + "play_name" : "Henry IV", + "speech_number" : 8, + "line_number" : "5.5.41", + "speaker" : "KING HENRY IV", + "text_entry" : "To fight with Glendower and the Earl of March." 
+ }, + "sort" : [ + 3200 + ] + }, + { + "_index" : "shakespeare", + "_id" : "3198", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 3199, + "play_name" : "Henry IV", + "speech_number" : 8, + "line_number" : "5.5.40", + "speaker" : "KING HENRY IV", + "text_entry" : "Myself and you, son Harry, will towards Wales," + }, + "sort" : [ + 3199 + ] + }, + { + "_index" : "shakespeare", + "_id" : "3197", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 3198, + "play_name" : "Henry IV", + "speech_number" : 8, + "line_number" : "5.5.39", + "speaker" : "KING HENRY IV", + "text_entry" : "Who, as we hear, are busily in arms:" + }, + "sort" : [ + 3198 + ] + }, + { + "_index" : "shakespeare", + "_id" : "3196", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 3197, + "play_name" : "Henry IV", + "speech_number" : 8, + "line_number" : "5.5.38", + "speaker" : "KING HENRY IV", + "text_entry" : "To meet Northumberland and the prelate Scroop," + }, + "sort" : [ + 3197 + ] + }, + { + "_index" : "shakespeare", + "_id" : "3195", + "_score" : null, + "_source" : { + "type" : "line", + "line_id" : 3196, + "play_name" : "Henry IV", + "speech_number" : 8, + "line_number" : "5.5.37", + "speaker" : "KING HENRY IV", + "text_entry" : "Towards York shall bend you with your dearest speed," + }, + "sort" : [ + 3196 + ] + } + ] + } +} +``` + +The `sort` parameter is an array, so you can specify multiple field values in the order of their priority. + +If two documents have the same value for `line_id`, OpenSearch orders them by `speech_number`, which is the second sort criterion: + +```json +GET shakespeare/_search +{ + "query": { + "term": { + "play_name": { + "value": "Henry IV" + } + } + }, + "sort": [ + { + "line_id": { + "order": "desc" + } + }, + { + "speech_number": { + "order": "desc" + } + } + ] +} +``` + +You can continue to sort by any number of field values to get the results in just the right order. 
It doesn’t have to be a numerical value—you can also sort by date or timestamp fields: + +```json +"sort": [ + { + "date": { + "order": "desc" + } + } + ] +``` + +A text field that is analyzed cannot be used to sort documents, because the inverted index only contains the individual tokenized terms and not the entire string. So you cannot sort by the `play_name`, for example. + +To bypass this limitation, you can use a raw version of the text field mapped as a keyword type. In the following example, `play_name.keyword` is not analyzed and you have a copy of the full original version for sorting purposes: + +```json +GET shakespeare/_search +{ + "query": { + "term": { + "play_name": { + "value": "Henry IV" + } + } + }, + "sort": [ + { + "play_name.keyword": { + "order": "desc" + } + } + ] +} +``` + +The results are sorted by the `play_name` field in descending (reverse alphabetical) order. + +Use `sort` with the [`search_after` parameter]({{site.url}}{{site.baseurl}}/opensearch/search/paginate#the-search_after-parameter) for more efficient scrolling. +The results start with the document that comes after the sort values you specify in the `search_after` array. + +Make sure you have the same number of values in the `search_after` array as in the `sort` array, also ordered in the same way. +In this case, you are requesting results starting with the document that comes after `line_id = 3202` and `speech_number = 8`: + +```json +GET shakespeare/_search +{ + "query": { + "term": { + "play_name": { + "value": "Henry IV" + } + } + }, + "sort": [ + { + "line_id": { + "order": "desc" + } + }, + { + "speech_number": { + "order": "desc" + } + } + ], + "search_after": [ + "3202", + "8" + ] +} +``` + +## Sort mode + +The sort mode is applicable to sorting by array or multivalued fields. It specifies what array value should be chosen for sorting the document. For numeric fields that contain an array of numbers, you can sort by the `avg`, `sum`, or `median` modes. 
To sort by the minimum or maximum values, use the `min` or `max` modes, which work for both numeric and string data types. + +The default mode is `min` for ascending sort order and `max` for descending sort order. + +The following example illustrates sorting by an array field using the sort mode. + +Consider an index that holds student grades. Index two documents into the index: + +```json +PUT students/_doc/1 +{ + "name": "John Doe", + "grades": [70, 90] +} + +PUT students/_doc/2 +{ + "name": "Mary Major", + "grades": [80, 100] +} +``` + +Sort all students by highest grade average using the `avg` mode: + +```json +GET students/_search +{ + "query" : { + "match_all": {} + }, + "sort" : [ + {"grades" : {"order" : "desc", "mode" : "avg"}} + ] +} +``` + +The response contains students sorted by their average grade in descending order: + +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ + { + "_index" : "students", + "_id" : "2", + "_score" : null, + "_source" : { + "name" : "Mary Major", + "grades" : [ + 80, + 100 + ] + }, + "sort" : [ + 90 + ] + }, + { + "_index" : "students", + "_id" : "1", + "_score" : null, + "_source" : { + "name" : "John Doe", + "grades" : [ + 70, + 90 + ] + }, + "sort" : [ + 80 + ] + } + ] + } +} +``` + +## Sorting nested objects + +When sorting [nested]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/nested) objects, provide the `path` parameter specifying the path to the field on which to sort. 
+ +For example, in the index `students`, map the field `first_sem` as `nested`: + +```json +PUT students +{ + "mappings" : { + "properties": { + "first_sem": { + "type" : "nested" + } + } + } +} +``` + +Index two documents with nested fields: + +```json +PUT students/_doc/1 +{ + "name": "John Doe", + "first_sem" : { + "grades": [70, 90] + } +} + +PUT students/_doc/2 +{ + "name": "Mary Major", + "first_sem": { + "grades": [80, 100] + } +} +``` + +When sorting by grade average, provide the path to the nested field: + +```json +GET students/_search +{ + "query" : { + "match_all": {} + }, + "sort" : [ + {"first_sem.grades": { + "order" : "desc", + "mode" : "avg", + "nested": { + "path": "first_sem" + } + } + } + ] +} +``` + +## Handling missing values + +The `missing` parameter specifies the handling of missing values. The built-in valid values are `_last` (list the documents with the missing value last) and `_first` (list the documents with the missing value first). The default value is `_last`. You can also specify a custom value to be used as the sort value for documents that are missing the field. 
+ +For example, you can index a document with an `average` field and another document without an `average` field: + +```json +PUT students/_doc/1 +{ + "name": "John Doe", + "average": 80 +} + +PUT students/_doc/2 +{ + "name": "Mary Major" +} +``` + +Sort the documents, ordering the document with a missing field first: + +```json +GET students/_search +{ + "query": { + "match_all": {} + }, + "sort": [ + { + "average": { + "order": "desc", + "missing": "_first" + } + } + ] +} +``` + +The response lists document 2 first: + +```json +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ + { + "_index" : "students", + "_id" : "2", + "_score" : null, + "_source" : { + "name" : "Mary Major" + }, + "sort" : [ + 9223372036854775807 + ] + }, + { + "_index" : "students", + "_id" : "1", + "_score" : null, + "_source" : { + "name" : "John Doe", + "average" : 80 + }, + "sort" : [ + 80 + ] + } + ] + } +} +``` + +## Ignoring unmapped fields + +If a field is not mapped, a search request that sorts by this field fails by default. To avoid this, you can use the `unmapped_type` parameter, which signals to OpenSearch to ignore the field. For example, if you set `unmapped_type` to `long`, the field is treated as if it were mapped as type `long`. Additionally, all documents in an index in which the field is unmapped are treated as if they had no value in this field, so they are not sorted by it. + +For example, consider two indexes. 
Index a document that contains an `average` field in the first index: + +```json +PUT students/_doc/1 +{ + "name": "John Doe", + "average": 80 +} +``` + +Index a document that does not contain an `average` field in the second index: + +```json +PUT students_no_map/_doc/2 +{ + "name": "Mary Major" +} +``` + +Search for all documents in both indexes and sort them by the `average` field: + +```json +GET students*/_search +{ + "query": { + "match_all": {} + }, + "sort": [ + { + "average": { + "order": "desc" + } + } + ] +} +``` + +By default, the second index produces an error because the `average` field is not mapped: + +```json +{ + "took" : 3, + "timed_out" : false, + "_shards" : { + "total" : 2, + "successful" : 1, + "skipped" : 0, + "failed" : 1, + "failures" : [ + { + "shard" : 0, + "index" : "students_no_map", + "node" : "cam9NWqVSV-jUIkQ3tRubw", + "reason" : { + "type" : "query_shard_exception", + "reason" : "No mapping found for [average] in order to sort on", + "index" : "students_no_map", + "index_uuid" : "JgfRkypKSUSpyU-ZXr9kKA" + } + } + ] + }, + "hits" : { + "total" : { + "value" : 1, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ + { + "_index" : "students", + "_id" : "1", + "_score" : null, + "_source" : { + "name" : "John Doe", + "average" : 80 + }, + "sort" : [ + 80 + ] + } + ] + } +} +``` + +You can specify the `unmapped_type` parameter so that the unmapped field is ignored: + +```json +GET students*/_search +{ + "query": { + "match_all": {} + }, + "sort": [ + { + "average": { + "order": "desc", + "unmapped_type": "long" + } + } + ] +} +``` + +The response contains both documents: + +```json +{ + "took" : 4, + "timed_out" : false, + "_shards" : { + "total" : 2, + "successful" : 2, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ + { + "_index" : "students", + "_id" : "1", + "_score" : null, + "_source" : { + "name" : "John Doe", + "average" : 80 + 
}, + "sort" : [ + 80 + ] + }, + { + "_index" : "students_no_map", + "_id" : "2", + "_score" : null, + "_source" : { + "name" : "Mary Major" + }, + "sort" : [ + -9223372036854775808 + ] + } + ] + } +} +``` + +## Tracking scores + +By default, scores are not computed when sorting on a field. You can set `track_scores` to `true` to compute and track scores: + +```json +GET students/_search +{ + "query": { + "match_all": {} + }, + "sort": [ + { + "average": { + "order": "desc" + } + } + ], + "track_scores": true +} +``` + +## Sorting by geo distance + +You can sort documents by `_geo_distance`. The following parameters are supported. + +Parameter | Description +:--- | :--- +distance_type | Specifies the method of computing the distance. Valid values are `arc` and `plane`. The `plane` method is faster but less accurate for long distances or close to the poles. Default is `arc`. +mode | Specifies how to handle a field with several geopoints. By default, documents are sorted by the shortest distance when the sort order is ascending and by the longest distance when the sort order is descending. Valid values are `min`, `max`, `median`, and `avg`. +unit | Specifies the units used to compute sort values. Default is meters (`m`). +ignore_unmapped | Specifies how to treat an unmapped field. Set `ignore_unmapped` to `true` to ignore unmapped fields. Default is `false` (produce an error when encountering an unmapped field). + +Sorting by `_geo_distance` does not support the `missing` parameter. The distance is always considered to be `infinity` when a document does not contain the field used for computing distance.
+{: .note} + +For example, index two documents with geopoints: + +```json +PUT testindex1/_doc/1 +{ + "point": [74.00, 40.71] +} + +PUT testindex1/_doc/2 +{ + "point": [73.77, -69.63] +} +``` + +Search for all documents and sort them by the distance from the provided point: + +```json +GET testindex1/_search +{ + "sort": [ + { + "_geo_distance": { + "point": [59, -54], + "order": "asc", + "unit": "km", + "distance_type": "arc", + "mode": "min", + "ignore_unmapped": true + } + } + ], + "query": { + "match_all": {} + } +} +``` + +The response contains the sorted documents: + +```json +{ + "took" : 864, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ + { + "_index" : "testindex1", + "_id" : "2", + "_score" : null, + "_source" : { + "point" : [ + 73.77, + -69.63 + ] + }, + "sort" : [ + 1891.2667493895767 + ] + }, + { + "_index" : "testindex1", + "_id" : "1", + "_score" : null, + "_source" : { + "point" : [ + 74.0, + 40.71 + ] + }, + "sort" : [ + 10628.402240213345 + ] + } + ] + } +} +``` + +You can provide coordinates in any format supported by the geopoint field type. For a description of all formats, see the [geopoint field type documentation]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point). +{: .note} + +To pass multiple geopoints to `_geo_distance`, use an array: + +```json +GET testindex1/_search +{ + "sort": [ + { + "_geo_distance": { + "point": [[59, -54], [60, -53]], + "order": "asc", + "unit": "km", + "distance_type": "arc", + "mode": "min", + "ignore_unmapped": true + } + } + ], + "query": { + "match_all": {} + } +} +``` + +For each document, the sorting distance is calculated as the minimum, maximum, median, or average (as specified by the `mode`) of the distances from all points provided in the search to all points in the document.
+ +## Performance considerations + +Sorted field values are loaded into memory for sorting. Therefore, for minimum overhead we recommend mapping [numeric types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric) to the smallest acceptable types, like `short`, `integer`, and `float`. [String types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/string) should not have the sorted field analyzed or tokenized. \ No newline at end of file diff --git a/_opensearch/supported-field-types/completion.md b/_opensearch/supported-field-types/completion.md index 7675737ee1..29d239ef30 100644 --- a/_opensearch/supported-field-types/completion.md +++ b/_opensearch/supported-field-types/completion.md @@ -49,7 +49,7 @@ The following table lists the parameters accepted by completion fields. Parameter | Description :--- | :--- -`input` | A list of possible completions as a string or array of strings. Cannot contain `\u0000` (null), `\u001f` (information separator one), or `u001e` (information separator two). Required. +`input` | A list of possible completions as a string or array of strings. Cannot contain `\u0000` (null), `\u001f` (information separator one), or `\u001e` (information separator two). Required. `weight` | A positive integer or a positive integer string for ranking suggestions. Optional. 
Multiple suggestions can be indexed as follows: diff --git a/_opensearch/supported-field-types/nested.md b/_opensearch/supported-field-types/nested.md index 51ce5e8941..84f29e61c2 100644 --- a/_opensearch/supported-field-types/nested.md +++ b/_opensearch/supported-field-types/nested.md @@ -27,7 +27,7 @@ PUT testindex1/_doc/100 } ``` -When these objects are stored, they are flattened, so their internal represenation has an array of all values for each field: +When these objects are stored, they are flattened, so their internal representation has an array of all values for each field: ```json { @@ -142,13 +142,13 @@ Nested objects are stored as separate documents, and the parent object has refer ```json PUT testindex1 { - "mappings" : { - "properties": { - "patients": { - "type" : "nested" - } - } + "mappings" : { + "properties": { + "patients": { + "type" : "nested" + } } + } } ``` diff --git a/_opensearch/ux.md b/_opensearch/ux.md deleted file mode 100644 index 65cc50a759..0000000000 --- a/_opensearch/ux.md +++ /dev/null @@ -1,1069 +0,0 @@ ---- -layout: default -title: Search experience -nav_order: 55 ---- - -# Search experience - -Expectations from search engines have evolved over the years. Just returning relevant results quickly is no longer enough for most users. OpenSearch includes many features that enhance the user’s search experience as follows: - -Feature | Description -:--- | :--- -Autocomplete queries | Suggest phrases as the user types. -Paginate results | Rather than a single, long list, break search results into pages. -Scroll search | Return a large number of results in batches. -Sort results | Allow sorting results by different criteria. -Highlight query matches | Highlight the search term in the results. - ---- - -## Autocomplete queries - -Autocomplete shows suggestions to users while they type. - -For example, if a user types "pop," OpenSearch provides suggestions like "popcorn" or "popsicles." 
These suggestions preempt your user's intention and lead them to a possible search term more quickly. - -OpenSearch lets you design autocomplete that updates with each keystroke, provides a few relevant suggestions, and tolerates typos. - -Implement autocomplete using one of three methods: - -- Prefix matching -- Edge N-gram matching -- Completion suggesters - -These methods are described in the following sections. - -### Prefix matching - -Prefix matching finds documents that matches the last term in the query string. - -For example, assume that the user types “qui” into a search UI. To autocomplete this phrase, use the `match_phrase_prefix` query to search all `text_entry` fields that begin with the prefix "qui." -To make the word order and relative positions flexible, specify a `slop` value. To learn about the `slop` option, see the [Multi-match query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text#multi-match). - -#### Sample Request - -```json -GET shakespeare/_search -{ - "query": { - "match_phrase_prefix": { - "text_entry": { - "query": "qui", - "slop": 3 - } - } - } -} -``` - -Prefix matching doesn’t require any special mappings. It works with your data as-is. -However, it’s a fairly resource-intensive operation. A prefix of `a` could match hundreds of thousands of terms and not be useful to your user. - -To limit the impact of prefix expansion, set `max_expansions` to a reasonable number. To learn about the `max_expansions` option, see [Advanced filter options]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text#advanced-filter-options). - -#### Sample Request - -```json -GET shakespeare/_search -{ - "query": { - "match_phrase_prefix": { - "text_entry": { - "query": "qui", - "slop": 3, - "max_expansions": 10 - } - } - } -} -``` - -The ease of implementing query-time autocomplete comes at the cost of performance. -When implementing this feature on a large scale, we recommend an index-time solution. 
With an index-time solution, you might experience slower indexing, but it’s a price you pay only once and not for every query. The edge N-gram and completion suggester methods are index time. - -### Edge N-gram matching - -During indexing, edge N-grams chop up a word into a sequence of N characters to support a faster lookup of partial search terms. - -If you N-gram the word "quick," the results depend on the value of N. - -N | Type | N-gram -:--- | :--- | :--- -1 | Unigram | [ `q`, `u`, `i`, `c`, `k` ] -2 | Bigram | [ `qu`, `ui`, `ic`, `ck` ] -3 | Trigram | [ `qui`, `uic`, `ick` ] -4 | Four-gram | [ `quic`, `uick` ] -5 | Five-gram | [ `quick` ] - -Autocomplete needs only the beginning N-grams of a search phrase, so OpenSearch uses a special type of N-gram called edge N-gram. - -Edge N-gramming the word "quick" results in the following: - -- `q` -- `qu` -- `qui` -- `quic` -- `quick` - -This follows the same sequence the user types. - -To configure a field to use edge N-grams, create an autocomplete analyzer with an `edge_ngram` filter: - -#### Sample Request - -```json -PUT shakespeare -{ - "mappings": { - "properties": { - "text_entry": { - "type": "text", - "analyzer": "autocomplete" - } - } - }, - "settings": { - "analysis": { - "filter": { - "edge_ngram_filter": { - "type": "edge_ngram", - "min_gram": 1, - "max_gram": 20 - } - }, - "analyzer": { - "autocomplete": { - "type": "custom", - "tokenizer": "standard", - "filter": [ - "lowercase", - "edge_ngram_filter" - ] - } - } - } - } -} -``` - -This example creates the index and instantiates the edge N-gram filter and analyzer. - -The `edge_ngram_filter` produces edge N-grams with a minimum N-gram length of 1 (a single letter) and a maximum length of 20. So it offers suggestions for words of up to 20 letters. - -The `autocomplete` analyzer tokenizes a string into individual terms, lowercases the terms, and then produces edge N-grams for each term using the `edge_ngram_filter`. 
- -Use the `analyze` operation to test this analyzer: - -```json -POST shakespeare/_analyze -{ - "analyzer": "autocomplete", - "text": "quick" -} -``` - -It returns edge N-grams as tokens: - -* `q` -* `qu` -* `qui` -* `quic` -* `quick` - -Use the `standard` analyzer at search time. Otherwise, the search query splits into edge N-grams and you get results for everything that matches `q`, `u`, and `i`. -This is one of the few occasions where you use a different analyzer on the index and query side. - -#### Sample Request - -```json -GET shakespeare/_search -{ - "query": { - "match": { - "text_entry": { - "query": "qui", - "analyzer": "standard" - } - } - } -} -``` - -#### Sample Response - -```json -{ - "took": 5, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 533, - "relation": "eq" - }, - "max_score": 9.712725, - "hits": [ - { - "_index": "shakespeare", - "_id": "22006", - "_score": 9.712725, - "_source": { - "type": "line", - "line_id": 22007, - "play_name": "Antony and Cleopatra", - "speech_number": 12, - "line_number": "5.2.44", - "speaker": "CLEOPATRA", - "text_entry": "Quick, quick, good hands." - } - }, - { - "_index": "shakespeare", - "_id": "54665", - "_score": 9.712725, - "_source": { - "type": "line", - "line_id": 54666, - "play_name": "Loves Labours Lost", - "speech_number": 21, - "line_number": "5.1.52", - "speaker": "HOLOFERNES", - "text_entry": "Quis, quis, thou consonant?" - } - } - ] - } -} -``` - -Alternatively, specify the `search_analyzer` in the mapping itself: - -```json -"mappings": { - "properties": { - "text_entry": { - "type": "text", - "analyzer": "autocomplete", - "search_analyzer": "standard" - } - } -} -``` - -### Completion suggester - -The completion suggester accepts a list of suggestions and builds them into a finite-state transducer (FST), an optimized data structure that’s essentially a graph. 
This data structure lives in memory and is optimized for fast prefix lookups. To learn more about FSTs, see [Wikipedia](https://en.wikipedia.org/wiki/Finite-state_transducer). - -As the user types, the completion suggester moves through the FST graph one character at a time along a matching path. After it runs out of user input, it examines the remaining endings to produce a list of suggestions. - -The completion suggester makes your autocomplete solution as efficient as possible and lets you have explicit control over its suggestions. - -Use a dedicated field type called `completion`, which stores the FST-like data structures in the index: - -```json -PUT shakespeare -{ - "mappings": { - "properties": { - "text_entry": { - "type": "completion" - } - } - } -} -``` - -To get back suggestions, use the `search` endpoint with the `suggest` parameter: - -```json -GET shakespeare/_search -{ - "suggest": { - "autocomplete": { - "prefix": "To be", - "completion": { - "field": "text_entry" - } - } - } -} -``` - -The phrase "to be" is prefix matched with the FST of the `text_entry` field. 
- -#### Sample Response - -```json -{ - "took": 9, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 0, - "relation": "eq" - }, - "max_score": null, - "hits": [] - }, - "suggest": { - "text_entry": [ - { - "text": "To be", - "offset": 0, - "length": 5, - "options": [ - { - "text": "To be a comrade with the wolf and owl,--", - "_index": "shakespeare", - "_id": "50652", - "_score": 1, - "_source": { - "type": "line", - "line_id": 50653, - "play_name": "King Lear", - "speech_number": 68, - "line_number": "2.4.230", - "speaker": "KING LEAR", - "text_entry": "To be a comrade with the wolf and owl,--" - } - }, - { - "text": "To be a make-peace shall become my age:", - "_index": "shakespeare", - "_id": "78566", - "_score": 1, - "_source": { - "type": "line", - "line_id": 78567, - "play_name": "Richard II", - "speech_number": 20, - "line_number": "1.1.160", - "speaker": "JOHN OF GAUNT", - "text_entry": "To be a make-peace shall become my age:" - } - } - ] - } - ] - } -} -``` - -To specify the number of suggestions that you want to return, use the `size` parameter: - -```json -GET shakespeare/_search -{ - "suggest": { - "autocomplete": { - "prefix": "To m", - "completion": { - "field": "text_entry", - "size": 3 - } - } - } -} -``` - -#### Sample Response - -```json -{ - "took": 3, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 0, - "relation": "eq" - }, - "max_score": null, - "hits": [] - }, - "suggest": { - "text_entry": [ - { - "text": "To m", - "offset": 0, - "length": 5, - "options": [ - { - "text": "To make a bastard and a slave of me!", - "_index": "shakespeare", - "_id": "5369", - "_score": 4, - "_source": { - "type": "line", - "line_id": 5370, - "play_name": "Henry VI Part 1", - "speech_number": 2, - "line_number": "4.5.15", - "speaker": "JOHN TALBOT", - "text_entry": "To make a 
bastard and a slave of me!" - } - }, - { - "text": "To make a bloody supper in the Tower.", - "_index": "shakespeare", - "_id": "12504", - "_score": 4, - "_source": { - "type": "line", - "line_id": 12505, - "play_name": "Henry VI Part 3", - "speech_number": 40, - "line_number": "5.5.85", - "speaker": "CLARENCE", - "text_entry": "To make a bloody supper in the Tower." - } - } - ] - } - ] - } -} -``` - -The `suggest` parameter finds suggestions using only prefix matching. -For example, you don't get back "To be, or not to be," which you might want as a suggestion. -To work around this issue, manually add curated suggestions and add weights to prioritize your suggestions. - -Index a document with an input suggestion and assign a weight: - -```json -PUT shakespeare/_doc/1 -{ - "text": "To m", - "text_entry": { - "input": [ - "To be, or not to be: that is the question:" - ], - "weight": 10 - } -} -``` - -Perform the same search as before: - -```json -GET shakespeare/_search -{ - "suggest": { - "autocomplete": { - "prefix": "To m", - "completion": { - "field": "text_entry", - "size": 3 - } - } - } -} -``` - -You see the indexed document as the first result: - -```json -{ - "took": 1021, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 0, - "relation": "eq" - }, - "max_score": null, - "hits": [] - }, - "suggest": { - "autocomplete": [ - { - "text": "To m", - "offset": 0, - "length": 5, - "options": [ - { - "text": "To be, or not to be: that is the question:", - "_index": "shakespeare", - "_id": "1", - "_score": 30, - "_source": { - "text": "To me", - "text_entry": { - "input": [ - "To be, or not to be: that is the question:" - ], - "weight": 10 - } - } - }, - { - "text": "To make a bastard and a slave of me!", - "_index": "shakespeare", - "_id": "5369", - "_score": 4, - "_source": { - "type": "line", - "line_id": 5370, - "play_name": "Henry VI Part 1", - "speech_number": 2, - 
"line_number": "4.5.15", - "speaker": "JOHN TALBOT", - "text_entry": "To make a bastard and a slave of me!" - } - } - ] - } - ] - } -} -``` - -Use the `term` suggester to suggest corrected spellings for individual words. -The `term` suggester uses an edit distance to compute suggestions. Edit distance is the number of characters that need to be changed for a term to match. - -In this example, the user misspells a search term: - -```json -GET shakespeare/_search -{ - "suggest": { - "spell-check": { - "text": "lief", - "term": { - "field": "text_entry" - } - } - } -} -``` - -The `term` suggester returns a list of corrections: - -```json -{ - "took": 48, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 0, - "relation": "eq" - }, - "max_score": null, - "hits": [] - }, - "suggest": { - "spell-check": [ - { - "text": "lief", - "offset": 0, - "length": 4, - "options": [ - { - "text": "lifes", - "score": 0.8, - "freq": 21 - }, - { - "text": "life", - "score": 0.75, - "freq": 805 - }, - { - "text": "lives", - "score": 0.6, - "freq": 187 - }, - { - "text": "liege", - "score": 0.6, - "freq": 138 - }, - { - "text": "lived", - "score": 0.6, - "freq": 80 - } - ] - } - ] - } -} -``` - -The higher the score, the better the suggestion is. The frequency represents the number of times the term appears in the documents of that index. - -To implement a "Did you mean `suggestion`?" feature, use a `phrase` suggester. -The `phrase` suggester is similar to the `term` suggester, except that it uses N-gram language models to suggest whole phrases instead of individual words. - -Create a custom analyzer called `trigram` that uses a `shingle` filter. 
This filter is similar to the `edge_ngram` filter, but it applies to words instead of letters: - -```json -PUT shakespeare -{ - "settings": { - "index": { - "analysis": { - "analyzer": { - "trigram": { - "type": "custom", - "tokenizer": "standard", - "filter": [ - "lowercase", - "shingle" - ] - } - }, - "filter": { - "shingle": { - "type": "shingle", - "min_shingle_size": 2, - "max_shingle_size": 3 - } - } - } - } - }, - "mappings": { - "properties": { - "text_entry": { - "type": "text", - "fields": { - "trigram": { - "type": "text", - "analyzer": "trigram" - } - } - } - } - } -} -``` - -This example includes as incorrect phrase: - -```json -POST shakespeare/_search -{ - "suggest": { - "text": "That the qution", - "simple_phrase": { - "phrase": { - "field": "text_entry.trigram" - } - } - } -} -``` - -You get back the corrected phrase: - -```json -{ - "took": 3, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 0, - "relation": "eq" - }, - "max_score": null, - "hits": [] - }, - "suggest": { - "simple_phrase": [ - { - "text": "That the qution", - "offset": 0, - "length": 18, - "options": [ - { - "text": "that is the question", - "score": 0.0015543294 - } - ] - } - ] - } -} -``` - - -## Paginate results - -The `from` and `size` parameters return results to your users one page at a time. - -The `from` parameter is the document number that you want to start showing the results from. The `size` parameter is the number of results that you want to show. Together, they let you return a subset of the search results. - -For example, if the value of `size` is 10 and the value of `from` is 0, you see the first 10 results. If you change the value of `from` to 10, you see the next 10 results (because the results are zero-indexed). So, if you want to see results starting from result 11, `from` must be 10. 
- -```json -GET shakespeare/_search -{ - "from": 0, - "size": 10, - "query": { - "match": { - "play_name": "Hamlet" - } - } -} -``` - -To calculate the `from` parameter relative to the page number: - -```json -from = size * (page_number - 1) -``` - -Each time the user chooses the next page of the results, your application needs to make the same search query with an incremented `from` value. - -You can also specify the `from` and `size` parameters in the search URI: - -```json -GET shakespeare/_search?from=0&size=10 -``` - -If you only specify the `size` parameter, the `from` parameter defaults to 0. - -Querying for pages deep in your results can have a significant performance impact, so OpenSearch limits this approach to 10,000 results. - -The `from` and `size` parameters are stateless, so the results are based on the latest available data. -This can cause inconsistent pagination. -For example, assume a user stays on the first page of the results for a minute and then navigates to the second page; in that time, a new document is indexed in the background which is relevant enough to show up on the first page. In this scenario, the last result of the first page is pushed to the second page, so the user ends up seeing a result on the second page that they already saw on the first page. - -Use the `scroll` operation for consistent pagination. The `scroll` operation keeps a search context open for a certain period of time. Any data changes do not affect the results during this time. - -## Scroll search - -The `from` and `size` parameters allow you to paginate your search results, but with a limit of 10,000 results at a time. - -If you need to request massive volumes of data from, for example, a machine learning job, use the `scroll` operation instead. The `scroll` operation allows you to request an unlimited number of results. 
- -To use the scroll operation, add a `scroll` parameter to the request header with a search context to tell OpenSearch how long you need to keep scrolling. This search context needs to be long enough to process a single batch of results. - -To set the number of results that you want returned for each batch, use the `size` parameter: - -```json -GET shakespeare/_search?scroll=10m -{ - "size": 10000 -} -``` - -OpenSearch caches the results and returns a scroll ID to access them in batches: - -```json -"_scroll_id" : "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAUWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ==" -``` - -Pass this scroll ID to the `scroll` operation to get back the next batch of results: - -```json -GET _search/scroll -{ - "scroll": "10m", - "scroll_id": "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAUWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ==" -} -``` - -Using this scroll ID, you get results in batches of 10,000 as long as the search context is still open. Typically, the scroll ID does not change between requests, but it *can* change, so make sure to always use the latest scroll ID. If you don't send the next scroll request within the set search context, the `scroll` operation does not return any results. - -If you expect billions of results, use a sliced scroll. Slicing allows you to perform multiple scroll operations for the same request, but in parallel. -Set the ID and the maximum number of slices for the scroll: - -```json -GET shakespeare/_search?scroll=10m -{ - "slice": { - "id": 0, - "max": 10 - }, - "query": { - "match_all": {} - } -} -``` - -With a single scroll ID, you get back 10 results. -You can have up to 10 IDs. 
-Perform the same command with ID equal to 1: - -```json -GET shakespeare/_search?scroll=10m -{ - "slice": { - "id": 1, - "max": 10 - }, - "query": { - "match_all": {} - } -} -``` - -Close the search context when you’re done scrolling, because it continues to consume computing resources until the timeout: - -```json -DELETE _search/scroll/DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAcWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ== -``` - -#### Sample Response - -```json -{ - "succeeded": true, - "num_freed": 1 -} -``` - -To close all open scroll contexts: - -```json -DELETE _search/scroll/_all -``` - -The `scroll` operation corresponds to a specific timestamp. It doesn't consider documents added after that timestamp as potential results. - -Because open search contexts consume a lot of memory, we suggest you don't use the `scroll` operation for frequent user queries that don't need the search context open. Instead, use the `sort` parameter with the `search_after` parameter to scroll responses for user queries. - -## Sort results - -Sorting allows your users to sort the results in a way that’s most meaningful to them. - -By default, full-text queries sort results by the relevance score. -You can choose to sort the results by any field value in either ascending or descending order. - -For example, to sort results by descending order of a `line_id` value: - -```json -GET shakespeare/_search -{ - "query": { - "term": { - "play_name": { - "value": "Henry IV" - } - } - }, - "sort": [ - { - "line_id": { - "order": "desc" - } - } - ] -} -``` - -The sort parameter is an array, so you can specify multiple field values in the order of their priority. - -If you have two fields with the same value for `line_id`, OpenSearch uses `speech_number`, which is the second option for sorting. 
- -```json -GET shakespeare/_search -{ - "query": { - "term": { - "play_name": { - "value": "Henry IV" - } - } - }, - "sort": [ - { - "line_id": { - "order": "desc" - } - }, - { - "speech_number": { - "order": "desc" - } - } - ] -} -``` - -You can continue to sort by any number of field values to get the results in just the right order. It doesn’t have to be a numerical value---you can also sort by date or timestamp fields: - -```json -"sort": [ - { - "date": { - "order": "desc" - } - } - ] -``` - -For numeric fields that contain an array of numbers, you can sort by `avg`, `sum`, and `median` modes: - -```json -"sort": [ - { - "price": { - "order": "asc", - "mode": "avg" - } - } -] -``` - -To sort by the minimum or maximum values, use the `min` or `max` modes. These modes work for both numeric and string data types. - -A text field that’s analyzed cannot be used to sort documents, because the inverted index only contains the individual tokenized terms and not the entire string. So, you cannot sort by the `play_name`, for example. - -One workaround is map a raw version of the text field as a keyword type, so it won’t be analyzed and you have a copy of the full original version for sorting purposes. - -```json -GET shakespeare/_search -{ - "query": { - "term": { - "play_name": { - "value": "Henry IV" - } - } - }, - "sort": [ - { - "play_name.keyword": { - "order": "desc" - } - } - ] -} -``` - -You get back results sorted by the `play_name` field in alphabetic order. - -Use `sort` with `search_after` parameter for more efficient scrolling. -You get back results after the values you specify in the `search_after` array. - -Make sure you have the same number of values in the `search_after` array as in the `sort` array, also ordered in the same way. -In this case, you get back results after `line_id = 3202` and `speech_number = 8`. 
- -```json -GET shakespeare/_search -{ - "query": { - "term": { - "play_name": { - "value": "Henry IV" - } - } - }, - "sort": [ - { - "line_id": { - "order": "desc" - } - }, - { - "speech_number": { - "order": "desc" - } - } - ], - "search_after": [ - "3202", - "8" - ] -} -``` - -## Highlight query matches - -Highlighting emphasizes the search term(s) in the results. - -To highlight the search terms, add a `highlight` parameter outside of the query block: - -```json -GET shakespeare/_search -{ - "query": { - "match": { - "text_entry": "life" - } - }, - "highlight": { - "fields": { - "text_entry": {} - } - } -} -``` - -For each document in the results, you get back a `highlight` object that shows your search term wrapped in an `em` tag: - -```json -"highlight": { - "text_entry": [ - "my life, except my life." - ] -} -``` - -Design your application code to parse the results from the `highlight` object and perform some action on the search terms, such as changing their color, bolding, italicizing, and so on. - -To change the default `em` tags, use the `pretag` and `posttag` parameters: - -```json -GET shakespeare/_search?format=yaml -{ - "query": { - "match": { - "play_name": "Henry IV" - } - }, - "highlight": { - "pre_tags": [ - "" - ], - "post_tags": [ - "" - ], - "fields": { - "play_name": {} - } - } -} -``` - -The `highlight` parameter highlights the original terms even when using synonyms or stemming for the search itself.