From fef3b2b4a7aa6b25ac9bc82dd158e0199cccdf89 Mon Sep 17 00:00:00 2001
From: James Rodewig
Date: Mon, 18 Nov 2019 12:44:15 -0500
Subject: [PATCH] [DOCS] Reformat elision token filter docs

---
 .../tokenfilters/elision-tokenfilter.asciidoc | 176 ++++++++++++++++--
 1 file changed, 163 insertions(+), 13 deletions(-)

diff --git a/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
index 91577342f0abc..f3b6fe0cdf4ee 100644
--- a/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
@@ -1,15 +1,96 @@
 [[analysis-elision-tokenfilter]]
-=== Elision Token Filter
+=== Elision token filter
+++++
+<titleabbrev>Elision</titleabbrev>
+++++
 
-A token filter which removes elisions. For example, "l'avion" (the
-plane) will tokenized as "avion" (plane).
+Removes specified https://en.wikipedia.org/wiki/Elision[elisions] from
+the beginning of tokens. For example, you can use this filter to change
+`l'avion` to `avion`.
 
-Requires either an `articles` parameter which is a set of stop word articles, or
-`articles_path` which points to a text file containing the stop set. Also optionally
-accepts `articles_case`, which indicates whether the filter treats those articles as
-case sensitive.
+When not customized, the filter removes the following French elisions by default:
 
-For example:
+`l'`, `m'`, `t'`, `qu'`, `n'`, `s'`, `j'`, `d'`, `c'`, `jusqu'`, `quoiqu'`,
+`lorsqu'`, `puisqu'`
+
+Customized versions of this filter are included in several of {es}'s built-in
+<<analysis-lang-analyzer,language analyzers>>:
+
+* <<catalan-analyzer,Catalan analyzer>>
+* <<french-analyzer,French analyzer>>
+* <<irish-analyzer,Irish analyzer>>
+* <<italian-analyzer,Italian analyzer>>
+
+This filter uses Lucene's
+https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/util/ElisionFilter.html[ElisionFilter].
+
+[[analysis-elision-tokenfilter-analyze-ex]]
+==== Example
+
+The following <<indices-analyze,analyze API>> request uses the `elision`
+filter to remove `j'` from `j’examine près du wharf`:
+
+[source,console]
+--------------------------------------------------
+GET _analyze
+{
+  "tokenizer" : "standard",
+  "filter" : ["elision"],
+  "text" : "j’examine près du wharf"
+}
+--------------------------------------------------
+
+The filter produces the following tokens:
+
+[source,text]
+--------------------------------------------------
+[ examine, près, du, wharf ]
+--------------------------------------------------
+
+/////////////////////
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens" : [
+    {
+      "token" : "examine",
+      "start_offset" : 0,
+      "end_offset" : 9,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "près",
+      "start_offset" : 10,
+      "end_offset" : 14,
+      "type" : "<ALPHANUM>",
+      "position" : 1
+    },
+    {
+      "token" : "du",
+      "start_offset" : 15,
+      "end_offset" : 17,
+      "type" : "<ALPHANUM>",
+      "position" : 2
+    },
+    {
+      "token" : "wharf",
+      "start_offset" : 18,
+      "end_offset" : 23,
+      "type" : "<ALPHANUM>",
+      "position" : 3
+    }
+  ]
+}
+--------------------------------------------------
+/////////////////////
+
+[[analysis-elision-tokenfilter-analyzer-ex]]
+==== Add to an analyzer
+
+The following <<indices-create-index,create index API>> request uses the
+`elision` filter to configure a new
+<<analysis-custom-analyzer,custom analyzer>>.
 
 [source,console]
 --------------------------------------------------
@@ -18,16 +99,85 @@ PUT /elision_example
     "settings" : {
         "analysis" : {
             "analyzer" : {
-                "default" : {
-                    "tokenizer" : "standard",
+                "whitespace_elision" : {
+                    "tokenizer" : "whitespace",
                     "filter" : ["elision"]
                 }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+[[analysis-elision-tokenfilter-configure-parms]]
+==== Configurable parameters
+
+[[analysis-elision-tokenfilter-articles]]
+`articles`::
++
+--
+(Required+++*+++, array of string)
+List of elisions to remove.
+
+To be removed, the elision must be at the beginning of a token and be
+immediately followed by an apostrophe. Both the elision and apostrophe are
+removed.
+
+For custom `elision` filters, either this parameter or `articles_path` must be
+specified.
+--
+
+`articles_path`::
++
+--
+(Required+++*+++, string)
+Path to a file that contains a list of elisions to remove.
+
+This path must be absolute or relative to the `config` location, and the file
+must be UTF-8 encoded. Each elision in the file must be separated by a line
+break.
+
+To be removed, the elision must be at the beginning of a token and be
+immediately followed by an apostrophe. Both the elision and apostrophe are
+removed.
+
+For custom `elision` filters, either this parameter or `articles` must be
+specified.
+--
+
+`articles_case`::
+(Optional, boolean)
+If `true`, the filter treats any provided elisions as case sensitive.
+Defaults to `false`.
+
+[[analysis-elision-tokenfilter-customize]]
+==== Customize
+
+To customize the `elision` filter, duplicate it to create the basis
+for a new custom token filter. You can modify the filter using its configurable
+parameters.
+
+For example, the following request creates a custom case-sensitive `elision`
+filter that removes the `l'`, `m'`, `t'`, `qu'`, `n'`, `s'`,
+and `j'` elisions:
+
+[source,console]
+--------------------------------------------------
+PUT /elision_case_sensitive_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "whitespace",
+                    "filter" : ["elision_case_sensitive"]
+                }
             },
             "filter" : {
-                "elision" : {
+                "elision_case_sensitive" : {
                     "type" : "elision",
-                    "articles_case": true,
-                    "articles" : ["l", "m", "t", "qu", "n", "s", "j"]
+                    "articles" : ["l", "m", "t", "qu", "n", "s", "j"],
+                    "articles_case": true
                 }
             }
         }
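
The `articles_path` parameter documented in the patch has no example of its own; the following sketch shows how such a filter could be configured. It is not part of the patch: the index name `elision_path_example`, the filter name `elision_from_file`, and the `analysis/example_elisions.txt` path are hypothetical, and the sketch assumes a UTF-8 file under the `config` directory with one elision per line, as the `articles_path` description requires.

[source,console]
--------------------------------------------------
PUT /elision_path_example
{
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "default" : {
                    "tokenizer" : "whitespace",
                    "filter" : ["elision_from_file"]
                }
            },
            "filter" : {
                "elision_from_file" : {
                    "type" : "elision",
                    "articles_path" : "analysis/example_elisions.txt"
                }
            }
        }
    }
}
--------------------------------------------------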