diff --git a/docs/reference/analysis/tokenfilters/kstem-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/kstem-tokenfilter.asciidoc index 3b7796bc596fa..33de5d131446b 100644 --- a/docs/reference/analysis/tokenfilters/kstem-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/kstem-tokenfilter.asciidoc @@ -4,6 +4,112 @@ KStem ++++ -The `kstem` token filter is a high performance filter for english. All -terms must already be lowercased (use `lowercase` filter) for this -filter to work correctly. +Provides http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[KStem]-based stemming for +the English language. The `kstem` filter combines +<<algorithmic-stemmers,algorithmic stemming>> with a built-in +<<dictionary-stemmers,dictionary>>. + +The `kstem` filter tends to stem less aggressively than other English stemmer +filters, such as the <<analysis-porterstem-tokenfilter,`porter_stem`>> filter. + +The `kstem` filter is equivalent to the +<<analysis-stemmer-tokenfilter,`stemmer`>> filter's +<<analysis-stemmer-tokenfilter-language-parm,`light_english`>> variant. + +This filter uses Lucene's +{lucene-analysis-docs}/en/KStemFilter.html[KStemFilter]. + +[[analysis-kstem-tokenfilter-analyze-ex]] +==== Example + +The following analyze API request uses the `kstem` filter to stem `the foxes +jumping quickly` to `the fox jump quick`: + +[source,console] +---- +GET /_analyze +{ + "tokenizer": "standard", + "filter": [ "kstem" ], + "text": "the foxes jumping quickly" +} +---- + +The filter produces the following tokens: + +[source,text] +---- +[ the, fox, jump, quick ] +---- + +//// +[source,console-result] +---- +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "<ALPHANUM>", + "position": 0 + }, + { + "token": "fox", + "start_offset": 4, + "end_offset": 9, + "type": "<ALPHANUM>", + "position": 1 + }, + { + "token": "jump", + "start_offset": 10, + "end_offset": 17, + "type": "<ALPHANUM>", + "position": 2 + }, + { + "token": "quick", + "start_offset": 18, + "end_offset": 25, + "type": "<ALPHANUM>", + "position": 3 + } + ] +} +---- +//// + +[[analysis-kstem-tokenfilter-analyzer-ex]] +==== Add to an analyzer + +The following <<indices-create-index,create index API>> request uses the +`kstem` filter to configure a new <<analysis-custom-analyzer,custom analyzer>>. 
+ +[IMPORTANT] +==== +To work properly, the `kstem` filter requires lowercase tokens. To ensure tokens +are lowercased, add the <<analysis-lowercase-tokenfilter,`lowercase`>> filter +before the `kstem` filter in the analyzer configuration. +==== + +[source,console] +---- +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "whitespace", + "filter": [ + "lowercase", + "kstem" + ] + } + } + } + } +} +----