From a000c66a83ee39e9f9f07cd984b01b3569c8799c Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Tue, 6 Aug 2024 12:02:47 +0100 Subject: [PATCH 1/7] adding classic token filter docs #7876 Signed-off-by: AntonEliatra --- _analyzers/token-filters/classic.md | 94 +++++++++++++++++++++++++++++ _analyzers/token-filters/index.md | 2 +- 2 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 _analyzers/token-filters/classic.md diff --git a/_analyzers/token-filters/classic.md b/_analyzers/token-filters/classic.md new file mode 100644 index 0000000000..8482f23cde --- /dev/null +++ b/_analyzers/token-filters/classic.md @@ -0,0 +1,94 @@ +--- +layout: default +title: classic +parent: Token filters +nav_order: 150 +--- + +# Classic token filter + +The primary function of the `classic` token filter is to work along side `classic` tokenizer and process tokens by applying several common transformations that help in text analysis and search. The transformations include: + - Removal of possessive endings such as "'s", for example: "John's" becomes "John". + - Separating words on internal hyphens, making terms like "co-operate" become tokens "co" and "operate". + - Removal of "." from acronyms, for example: "D.A.R.P.A." becomes "DARPA". + + +## Example + +Following is an example of how you can define an analyzer with the `classic` filter: + +```json +PUT /custom_classic_filter +{ + "settings": { + "analysis": { + "analyzer": { + "custom_classic": { + "type": "custom", + "tokenizer": "classic", + "filter": ["classic"] + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Generated tokens + +You can use the following command to examine the tokens being generated using the created analyzer: + +```json +POST /custom_classic_filter/_analyze +{ + "analyzer": "custom_classic", + "text": "John's co-operate was excellent." 
+} +``` +{% include copy-curl.html %} + +The response contains the generated tokens: + +```json +{ + "tokens": [ + { + "token": "John", + "start_offset": 0, + "end_offset": 6, + "type": "<APOSTROPHE>", + "position": 0 + }, + { + "token": "co", + "start_offset": 7, + "end_offset": 9, + "type": "<ALPHANUM>", + "position": 1 + }, + { + "token": "operate", + "start_offset": 10, + "end_offset": 17, + "type": "<ALPHANUM>", + "position": 2 + }, + { + "token": "was", + "start_offset": 18, + "end_offset": 21, + "type": "<ALPHANUM>", + "position": 3 + }, + { + "token": "excellent", + "start_offset": 22, + "end_offset": 31, + "type": "<ALPHANUM>", + "position": 4 + } + ] +} +``` + diff --git a/_analyzers/token-filters/index.md b/_analyzers/token-filters/index.md index e6d9875736..06a01e4579 100644 --- a/_analyzers/token-filters/index.md +++ b/_analyzers/token-filters/index.md @@ -17,7 +17,7 @@ Token filter | Underlying Lucene token filter| Description `asciifolding` | [ASCIIFoldingFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.html) | Converts alphabetic, numeric, and symbolic characters. `cjk_bigram` | [CJKBigramFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html) | Forms bigrams of Chinese, Japanese, and Korean (CJK) tokens. `cjk_width` | [CJKWidthFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/cjk/CJKWidthFilter.html) | Normalizes Chinese, Japanese, and Korean (CJK) tokens according to the following rules:
- Folds full-width ASCII character variants into the equivalent basic Latin characters.
- Folds half-width Katakana character variants into the equivalent Kana characters. -`classic` | [ClassicFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/classic/ClassicFilter.html) | Performs optional post-processing on the tokens generated by the classic tokenizer. Removes possessives (`'s`) and removes `.` from acronyms. +[`classic`]({{site.url}}{{site.baseurl}}/analyzers/token-filters/classic) | [ClassicFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/classic/ClassicFilter.html) | Performs optional post-processing on the tokens generated by the classic tokenizer. Removes possessives (`'s`) and removes `.` from acronyms. `common_grams` | [CommonGramsFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html) | Generates bigrams for a list of frequently occurring terms. The output contains both single terms and bigrams. `conditional` | [ConditionalTokenFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.html) | Applies an ordered list of token filters to tokens that match the conditions provided in a script. `decimal_digit` | [DecimalDigitFilter](https://lucene.apache.org/core/9_10_0/analysis/common/org/apache/lucene/analysis/core/DecimalDigitFilter.html) | Converts all digits in the Unicode decimal number general category to basic Latin digits (0--9). 
From 191dbd11d7a1732265be391cd8f733840195df54 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Tue, 6 Aug 2024 17:52:09 +0100 Subject: [PATCH 2/7] Updating details as per comments Signed-off-by: AntonEliatra --- _analyzers/token-filters/classic.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/_analyzers/token-filters/classic.md b/_analyzers/token-filters/classic.md index 8482f23cde..e0549dd766 100644 --- a/_analyzers/token-filters/classic.md +++ b/_analyzers/token-filters/classic.md @@ -2,20 +2,20 @@ layout: default title: classic parent: Token filters -nav_order: 150 +nav_order: 50 --- # Classic token filter -The primary function of the `classic` token filter is to work along side `classic` tokenizer and process tokens by applying several common transformations that help in text analysis and search. The transformations include: - - Removal of possessive endings such as "'s", for example: "John's" becomes "John". - - Separating words on internal hyphens, making terms like "co-operate" become tokens "co" and "operate". - - Removal of "." from acronyms, for example: "D.A.R.P.A." becomes "DARPA". +The primary function of the classic token filter is to work alongside the classic tokenizer. It processes tokens by applying several common transformations. These transformations aid in text analysis and search. These include: + - Removal of possessive endings such as *'s*, for example: *John's* becomes *John*. + - Separating words on internal hyphens, making terms like *co-operate* become tokens *co* and *operate*. + - Removal of *.* from acronyms, for example: *D.A.R.P.A.* becomes *DARPA*. 
## Example -Following is an example of how you can define an analyzer with the `classic` filter: +The following example request creates a new index named `custom_classic_filter` and configures an analyzer with the `classic` filter: ```json PUT /custom_classic_filter @@ -37,7 +37,7 @@ PUT /custom_classic_filter ## Generated tokens -You can use the following command to examine the tokens being generated using the created analyzer: +Use the following request to examine the tokens generated using the created analyzer: ```json POST /custom_classic_filter/_analyze From f5023737cbfffec94e26b4cf26ca421b64426699 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Tue, 6 Aug 2024 17:58:45 +0100 Subject: [PATCH 3/7] Update classic.md Signed-off-by: AntonEliatra --- _analyzers/token-filters/classic.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_analyzers/token-filters/classic.md b/_analyzers/token-filters/classic.md index e0549dd766..9b65a5eb5d 100644 --- a/_analyzers/token-filters/classic.md +++ b/_analyzers/token-filters/classic.md @@ -10,7 +10,7 @@ nav_order: 50 The primary function of the classic token filter is to work alongside the classic tokenizer. It processes tokens by applying several common transformations. These transformations aid in text analysis and search. These include: - Removal of possessive endings such as *'s*, for example: *John's* becomes *John*. - Separating words on internal hyphens, making terms like *co-operate* become tokens *co* and *operate*. - - Removal of *.* from acronyms, for example: *D.A.R.P.A.* becomes *DARPA*. + - Removal of periods from acronyms, for example: *D.A.R.P.A.* becomes *DARPA*. 
## Example From 962e3a830a40fee66ae68ccfa638ed2e32e627b6 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Tue, 3 Sep 2024 16:11:50 +0100 Subject: [PATCH 4/7] Update classic.md Signed-off-by: AntonEliatra --- _analyzers/token-filters/classic.md | 1 - 1 file changed, 1 deletion(-) diff --git a/_analyzers/token-filters/classic.md b/_analyzers/token-filters/classic.md index 9b65a5eb5d..1717596d8e 100644 --- a/_analyzers/token-filters/classic.md +++ b/_analyzers/token-filters/classic.md @@ -9,7 +9,6 @@ nav_order: 50 The primary function of the classic token filter is to work alongside the classic tokenizer. It processes tokens by applying several common transformations. These transformations aid in text analysis and search. These include: - Removal of possessive endings such as *'s*, for example: *John's* becomes *John*. - - Separating words on internal hyphens, making terms like *co-operate* become tokens *co* and *operate*. - Removal of periods from acronyms, for example: *D.A.R.P.A.* becomes *DARPA*. 
From e8aa75dded3cb06cbd877a4c31ee987d06bdedc2 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Thu, 12 Sep 2024 11:09:09 +0100 Subject: [PATCH 5/7] Update classic.md Signed-off-by: AntonEliatra --- _analyzers/token-filters/classic.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_analyzers/token-filters/classic.md b/_analyzers/token-filters/classic.md index 1717596d8e..46fbd9ea88 100644 --- a/_analyzers/token-filters/classic.md +++ b/_analyzers/token-filters/classic.md @@ -36,7 +36,7 @@ PUT /custom_classic_filter ## Generated tokens -Use the following request to examine the tokens generated using the created analyzer: +Use the following request to examine the tokens generated using the analyzer: ```json POST /custom_classic_filter/_analyze From c51e5ea41993ac100e3e5861103bc19c47ca33a1 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 3 Oct 2024 11:22:30 -0400 Subject: [PATCH 6/7] Update _analyzers/token-filters/classic.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --- _analyzers/token-filters/classic.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_analyzers/token-filters/classic.md b/_analyzers/token-filters/classic.md index 46fbd9ea88..053b7ff925 100644 --- a/_analyzers/token-filters/classic.md +++ b/_analyzers/token-filters/classic.md @@ -7,7 +7,7 @@ nav_order: 50 # Classic token filter -The primary function of the classic token filter is to work alongside the classic tokenizer. It processes tokens by applying several common transformations. These transformations aid in text analysis and search. These include: +The primary function of the classic token filter is to work alongside the classic tokenizer. It processes tokens by applying the following common transformations, which aid in text analysis and search: - Removal of possessive endings such as *'s*, for example: *John's* becomes *John*. 
- Removal of periods from acronyms, for example: *D.A.R.P.A.* becomes *DARPA*. From 82a44bd4efe96a38c1f33cf998e52c40987d20c1 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Thu, 3 Oct 2024 20:06:06 +0100 Subject: [PATCH 7/7] Apply suggestions from code review Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Nathan Bower Signed-off-by: AntonEliatra --- _analyzers/token-filters/classic.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/_analyzers/token-filters/classic.md b/_analyzers/token-filters/classic.md index 053b7ff925..34db74a824 100644 --- a/_analyzers/token-filters/classic.md +++ b/_analyzers/token-filters/classic.md @@ -1,6 +1,6 @@ --- layout: default -title: classic +title: Classic parent: Token filters nav_order: 50 --- @@ -8,8 +8,8 @@ nav_order: 50 # Classic token filter The primary function of the classic token filter is to work alongside the classic tokenizer. It processes tokens by applying the following common transformations, which aid in text analysis and search: - - Removal of possessive endings such as *'s*, for example: *John's* becomes *John*. - - Removal of periods from acronyms, for example: *D.A.R.P.A.* becomes *DARPA*. + - Removal of possessive endings such as *'s*. For example, *John's* becomes *John*. + - Removal of periods from acronyms. For example, *D.A.R.P.A.* becomes *DARPA*. ## Example