From 38ff49fde9704b4ab0340d076ae747d13658c048 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Mon, 14 Oct 2024 15:35:50 +0100 Subject: [PATCH 1/3] add whitespace analyzer docs Signed-off-by: Anton Rubin --- _analyzers/whitespace.md | 86 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 _analyzers/whitespace.md diff --git a/_analyzers/whitespace.md b/_analyzers/whitespace.md new file mode 100644 index 0000000000..3528d31b8d --- /dev/null +++ b/_analyzers/whitespace.md @@ -0,0 +1,86 @@ +--- +layout: default +title: Whitespace analyzer +nav_order: 60 +--- + +# Whitespace analyzer + +The `whitespace` analyzer breaks text into tokens based solely on whitespace characters (spaces, tabs, etc.). It does not apply any transformations, such as lowercasing or removing stop words, therefore the case of the original text is retained and will include punctuation as part of the tokens. + +## Example configuration + +You can use the following command to create index `my_whitespace_index` with `whitespace` analyzer: + +```json +PUT /my_whitespace_index +{ + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "whitespace" + } + } + } +} +``` +{% include copy-curl.html %} + +## Configuring custom analyzer + +You can use the following command to configure index `my_custom_whitespace_index` with custom analyzer equivalent to `whitespace` analyzer but with added `lowercase` character filter: + +```json +PUT /my_custom_whitespace_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_custom_whitespace_analyzer": { + "type": "custom", + "tokenizer": "whitespace", + "filter": ["lowercase"] + } + } + } + }, + "mappings": { + "properties": { + "my_field": { + "type": "text", + "analyzer": "my_custom_whitespace_analyzer" + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +Use the following request to examine the tokens generated using the created analyzer: + +```json +POST /my_custom_whitespace_index/_analyze +{ + "analyzer": "my_custom_whitespace_analyzer", + "text": "The SLOW turtle swims away! 123" +} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + {"token": "the","start_offset": 0,"end_offset": 3,"type": "word","position": 0}, + {"token": "slow","start_offset": 4,"end_offset": 8,"type": "word","position": 1}, + {"token": "turtle","start_offset": 9,"end_offset": 15,"type": "word","position": 2}, + {"token": "swims","start_offset": 16,"end_offset": 21,"type": "word","position": 3}, + {"token": "away!","start_offset": 22,"end_offset": 27,"type": "word","position": 4}, + {"token": "123","start_offset": 28,"end_offset": 31,"type": "word","position": 5} + ] +} +``` From 472f738ddfb61494e3a538ac6ea5cf348798e9a9 Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Fri, 6 Dec 2024 13:32:59 -0500 Subject: [PATCH 2/3] Doc review Signed-off-by: Fanit Kolchina --- _analyzers/whitespace.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/_analyzers/whitespace.md b/_analyzers/whitespace.md index 3528d31b8d..6c9e9c639b 100644 --- a/_analyzers/whitespace.md +++ b/_analyzers/whitespace.md @@ -6,11 +6,11 @@ nav_order: 60 # Whitespace analyzer -The `whitespace` analyzer breaks text into tokens based solely on whitespace characters (spaces, tabs, etc.). It does not apply any transformations, such as lowercasing or removing stop words, therefore the case of the original text is retained and will include punctuation as part of the tokens. 
+The `whitespace` analyzer breaks text into tokens based only on whitespace characters (spaces, tabs, and others). It does not apply any transformations, such as lowercasing or removing stopwords, therefore the case of the original text is retained and punctuation is included as part of the tokens. -## Example configuration +## Example -You can use the following command to create index `my_whitespace_index` with `whitespace` analyzer: +Use the following command to create an index named `my_whitespace_index` with a `whitespace` analyzer: ```json PUT /my_whitespace_index @@ -27,9 +27,9 @@ PUT /my_whitespace_index ``` {% include copy-curl.html %} -## Configuring custom analyzer +## Configuring a custom analyzer -You can use the following command to configure index `my_custom_whitespace_index` with custom analyzer equivalent to `whitespace` analyzer but with added `lowercase` character filter: +Use the following command to configure an index with a custom analyzer that is equivalent to a `whitespace` analyzer with an added `lowercase` character filter: ```json PUT /my_custom_whitespace_index @@ -59,7 +59,7 @@ PUT /my_custom_whitespace_index ## Generated tokens -Use the following request to examine the tokens generated using the created analyzer: +Use the following request to examine the tokens generated using the analyzer: ```json POST /my_custom_whitespace_index/_analyze From c6a28e1cef84192f7256c300623d6a0d71c3c665 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Tue, 10 Dec 2024 10:22:19 -0500 Subject: [PATCH 3/3] Update _analyzers/whitespace.md Co-authored-by: Nathan Bower Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _analyzers/whitespace.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_analyzers/whitespace.md b/_analyzers/whitespace.md index 6c9e9c639b..67fee61295 100644 --- a/_analyzers/whitespace.md +++ b/_analyzers/whitespace.md @@ -6,7 +6,7 @@ nav_order: 60 # Whitespace analyzer -The `whitespace` analyzer breaks text into tokens based only on whitespace characters (spaces, tabs, and others). It does not apply any transformations, such as lowercasing or removing stopwords, therefore the case of the original text is retained and punctuation is included as part of the tokens. +The `whitespace` analyzer breaks text into tokens based only on white space characters (for example, spaces and tabs). It does not apply any transformations, such as lowercasing or removing stopwords, so the original case of the text is retained and punctuation is included as part of the tokens. ## Example
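For comparison, a minimal sketch of running the built-in `whitespace` analyzer (with no `lowercase` filter) on the same sample text through the `_analyze` API; it reuses the `my_whitespace_index` index created in the first example, and the expected output is inferred from the analyzer behavior described above rather than copied from the patches:

```json
POST /my_whitespace_index/_analyze
{
  "analyzer": "whitespace",
  "text": "The SLOW turtle swims away! 123"
}
```
{% include copy-curl.html %}

Because the plain `whitespace` analyzer applies no token filters, the expected tokens keep their original case and punctuation (`The`, `SLOW`, `turtle`, `swims`, `away!`, and `123`), with the same offsets and positions as in the lowercased response shown for the custom analyzer.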