From 6c8358e79e03a18f387a9a59db59516fc75b98c4 Mon Sep 17 00:00:00 2001
From: Anton Rubin
Date: Thu, 10 Oct 2024 11:28:33 +0100
Subject: [PATCH 1/5] add standard tokenizer docs

Signed-off-by: Anton Rubin
---
 _analyzers/tokenizers/index.md    |   2 +-
 _analyzers/tokenizers/standard.md | 107 ++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 _analyzers/tokenizers/standard.md

diff --git a/_analyzers/tokenizers/index.md b/_analyzers/tokenizers/index.md
index d401851f60..1abc5ee7ff 100644
--- a/_analyzers/tokenizers/index.md
+++ b/_analyzers/tokenizers/index.md
@@ -2,7 +2,7 @@
 layout: default
 title: Tokenizers
 nav_order: 60
-has_children: false
+has_children: true
 has_toc: false
 ---
 
diff --git a/_analyzers/tokenizers/standard.md b/_analyzers/tokenizers/standard.md
new file mode 100644
index 0000000000..d029f1fb12
--- /dev/null
+++ b/_analyzers/tokenizers/standard.md
@@ -0,0 +1,107 @@
+---
+layout: default
+title: Standard
+parent: Tokenizers
+nav_order: 130
+---
+
+# Standard tokenizer
+
+The `standard` tokenizer is the default tokenizer used in OpenSearch. It tokenizes text based on word boundaries, using a grammar-based approach that recognizes letters, digits, and other characters like punctuation. It is highly versatile and suitable for many languages, as it follows Unicode text segmentation rules ([UAX#29](https://unicode.org/reports/tr29/)) to break text into tokens.
+
+## Example usage
+
+The following example request creates a new index named `my_index` and configures an analyzer with `standard` tokenizer:
+
+```json
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_standard_analyzer": {
+          "type": "standard"
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "content": {
+        "type": "text",
+        "analyzer": "my_standard_analyzer"
+      }
+    }
+  }
+}
+```
+{% include copy-curl.html %}
+
+## Generated tokens
+
+Use the following request to examine the tokens generated using the created analyzer:
+
+```json
+POST /my_index/_analyze
+{
+  "analyzer": "my_standard_analyzer",
+  "text": "OpenSearch is powerful, fast, and scalable."
+}
+```
+{% include copy-curl.html %}
+
+The response contains the generated tokens:
+
+```json
+{
+  "tokens": [
+    {
+      "token": "opensearch",
+      "start_offset": 0,
+      "end_offset": 10,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "is",
+      "start_offset": 11,
+      "end_offset": 13,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "powerful",
+      "start_offset": 14,
+      "end_offset": 22,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "fast",
+      "start_offset": 24,
+      "end_offset": 28,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "and",
+      "start_offset": 30,
+      "end_offset": 33,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "scalable",
+      "start_offset": 34,
+      "end_offset": 42,
+      "type": "<ALPHANUM>",
+      "position": 5
+    }
+  ]
+}
+```
+
+## Configuration
+
+The `standard` tokenizer can be configured with parameter `max_token_length` which sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at length configured in `max_token_length`. Default is `255` (Integer, _Optional_)
+

From 13a60c393fbf5ce481098bb6c02754435b273767 Mon Sep 17 00:00:00 2001
From: Anton Rubin
Date: Wed, 16 Oct 2024 17:20:08 +0100
Subject: [PATCH 2/5] updating parameter table

Signed-off-by: Anton Rubin
---
 _analyzers/tokenizers/standard.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/_analyzers/tokenizers/standard.md b/_analyzers/tokenizers/standard.md
index d029f1fb12..db0b7c0b3c 100644
--- a/_analyzers/tokenizers/standard.md
+++ b/_analyzers/tokenizers/standard.md
@@ -103,5 +103,9 @@ The response contains the generated tokens:
 
 ## Configuration
 
-The `standard` tokenizer can be configured with parameter `max_token_length` which sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at length configured in `max_token_length`. Default is `255` (Integer, _Optional_)
+The `standard` tokenizer can be configured with the following parameter.
+
+Parameter | Required/Optional | Data type | Description
+:--- | :--- | :--- | :---
+`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at length configured in `max_token_length`. Default is `255`.
 

From 0f34ebf937c35b39bfc01702717dbbe762df89c3 Mon Sep 17 00:00:00 2001
From: Fanit Kolchina
Date: Thu, 5 Dec 2024 14:00:16 -0500
Subject: [PATCH 3/5] Doc review

Signed-off-by: Fanit Kolchina
---
 _analyzers/tokenizers/standard.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/_analyzers/tokenizers/standard.md b/_analyzers/tokenizers/standard.md
index db0b7c0b3c..946296144a 100644
--- a/_analyzers/tokenizers/standard.md
+++ b/_analyzers/tokenizers/standard.md
@@ -7,11 +7,11 @@ nav_order: 130
 
 # Standard tokenizer
 
-The `standard` tokenizer is the default tokenizer used in OpenSearch. It tokenizes text based on word boundaries, using a grammar-based approach that recognizes letters, digits, and other characters like punctuation. It is highly versatile and suitable for many languages, as it follows Unicode text segmentation rules ([UAX#29](https://unicode.org/reports/tr29/)) to break text into tokens.
+The `standard` tokenizer is the default tokenizer in OpenSearch. It tokenizes text based on word boundaries using a grammar-based approach that recognizes letters, digits, and other characters like punctuation. It is highly versatile and suitable for many languages, because it follows Unicode text segmentation rules ([UAX#29](https://unicode.org/reports/tr29/)) to break text into tokens.
 
 ## Example usage
 
-The following example request creates a new index named `my_index` and configures an analyzer with `standard` tokenizer:
+The following example request creates a new index named `my_index` and configures an analyzer with a `standard` tokenizer:
 
 ```json
 PUT /my_index
@@ -39,7 +39,7 @@ PUT /my_index
 
 ## Generated tokens
 
-Use the following request to examine the tokens generated using the created analyzer:
+Use the following request to examine the tokens generated using the analyzer:
 
 ```json
 POST /my_index/_analyze
@@ -107,5 +107,5 @@ The `standard` tokenizer can be configured with the following parameter.
 
 Parameter | Required/Optional | Data type | Description
 :--- | :--- | :--- | :---
-`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at length configured in `max_token_length`. Default is `255`.
+`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at length configured in the `max_token_length`. Default is `255`.
 

From 7b81f6ac090dee4a1b3ef715f0dbf50ae8542d23 Mon Sep 17 00:00:00 2001
From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
Date: Thu, 5 Dec 2024 14:02:26 -0500
Subject: [PATCH 4/5] Update _analyzers/tokenizers/standard.md

Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
---
 _analyzers/tokenizers/standard.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_analyzers/tokenizers/standard.md b/_analyzers/tokenizers/standard.md
index 946296144a..32946dd6af 100644
--- a/_analyzers/tokenizers/standard.md
+++ b/_analyzers/tokenizers/standard.md
@@ -101,7 +101,7 @@ The response contains the generated tokens:
 }
 ```
 
-## Configuration
+## Parameters
 
 The `standard` tokenizer can be configured with the following parameter.
 

From ed9b8073efb979902cc636230d84e0756103b86d Mon Sep 17 00:00:00 2001
From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
Date: Mon, 9 Dec 2024 12:24:31 -0500
Subject: [PATCH 5/5] Apply suggestions from code review

Co-authored-by: Nathan Bower
Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com>
---
 _analyzers/tokenizers/standard.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_analyzers/tokenizers/standard.md b/_analyzers/tokenizers/standard.md
index 32946dd6af..c10f25802b 100644
--- a/_analyzers/tokenizers/standard.md
+++ b/_analyzers/tokenizers/standard.md
@@ -7,7 +7,7 @@ nav_order: 130
 
 # Standard tokenizer
 
-The `standard` tokenizer is the default tokenizer in OpenSearch. It tokenizes text based on word boundaries using a grammar-based approach that recognizes letters, digits, and other characters like punctuation. It is highly versatile and suitable for many languages, because it follows Unicode text segmentation rules ([UAX#29](https://unicode.org/reports/tr29/)) to break text into tokens.
+The `standard` tokenizer is the default tokenizer in OpenSearch. It tokenizes text based on word boundaries using a grammar-based approach that recognizes letters, digits, and other characters like punctuation. It is highly versatile and suitable for many languages because it uses Unicode text segmentation rules ([UAX#29](https://unicode.org/reports/tr29/)) to break text into tokens.
 
 ## Example usage
 
@@ -107,5 +107,5 @@ The `standard` tokenizer can be configured with the following parameter.
 
 Parameter | Required/Optional | Data type | Description
 :--- | :--- | :--- | :---
-`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at length configured in the `max_token_length`. Default is `255`.
+`max_token_length` | Optional | Integer | Sets the maximum length of the produced token. If this length is exceeded, the token is split into multiple tokens at the length configured in `max_token_length`. Default is `255`.
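
For quick reference, the `max_token_length` behavior documented by the patches above can be exercised with a request along the following lines. This is an illustrative sketch rather than part of the patch series: the index, tokenizer, and analyzer names are placeholders, and the request simply registers a `standard` tokenizer with a deliberately small token length.

```json
PUT /my_short_token_index
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "short_standard_tokenizer": {
          "type": "standard",
          "max_token_length": 5
        }
      },
      "analyzer": {
        "short_standard_analyzer": {
          "type": "custom",
          "tokenizer": "short_standard_tokenizer"
        }
      }
    }
  }
}
```

Analyzing a longer word with this analyzer shows the splitting behavior described in the parameter table:

```json
POST /my_short_token_index/_analyze
{
  "analyzer": "short_standard_analyzer",
  "text": "OpenSearch"
}
```

With `max_token_length` set to `5`, the ten-character token `OpenSearch` would be expected to come back as two tokens, `OpenS` and `earch`, because the tokenizer splits any token that exceeds the configured length.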