diff --git a/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc b/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc index 83dcfd2d380da..3ba59dcff13ae 100644 --- a/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc +++ b/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc @@ -1,29 +1,44 @@ [[analysis-htmlstrip-charfilter]] -=== HTML Strip Char Filter +=== HTML strip character filter +++++ +HTML strip +++++ -The `html_strip` character filter strips HTML elements from the text and -replaces HTML entities with their decoded value (e.g. replacing `&amp;` with -`&`). +Strips HTML elements from a text and replaces HTML entities with their decoded +value (e.g., replaces `&amp;` with `&`). -[float] -=== Example output +The `html_strip` filter uses Lucene's +{lucene-analysis-docs}/charfilter/HTMLStripCharFilter.html[HTMLStripCharFilter]. + +[[analysis-htmlstrip-charfilter-analyze-ex]] +==== Example + +The following <<indices-analyze,analyze API>> request uses the +`html_strip` filter to change the text `

<p>I&apos;m so <b>happy</b>!</p>

` to +`\nI'm so happy!\n`. [source,console] ---------------------------- -POST _analyze +---- +GET /_analyze { - "tokenizer": "keyword", <1> - "char_filter": [ "html_strip" ], + "tokenizer": "keyword", + "char_filter": [ + "html_strip" + ], "text": "

<p>I&apos;m so <b>happy</b>!</p>

" } ---------------------------- +---- -<1> The <> returns a single term. +The filter produces the following text: -///////////////////// +[source,text] +---- +[ \nI'm so happy!\n ] +---- +//// [source,console-result] ----------------------------- +---- { "tokens": [ { @@ -35,43 +50,60 @@ POST _analyze } ] } ----------------------------- - -///////////////////// - +---- +//// -The above example returns the term: +[[analysis-htmlstrip-charfilter-analyzer-ex]] +==== Add to an analyzer -[source,text] ---------------------------- -[ \nI'm so happy!\n ] ---------------------------- - -The same example with the `standard` tokenizer would return the following terms: +The following <> request uses the +`html_strip` filter to configure a new +<>. -[source,text] ---------------------------- -[ I'm, so, happy ] ---------------------------- - -[float] -=== Configuration +[source,console] +---- +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "keyword", + "char_filter": [ + "html_strip" + ] + } + } + } + } +} +---- -The `html_strip` character filter accepts the following parameter: +[[analysis-htmlstrip-charfilter-configure-parms]] +==== Configurable parameters -[horizontal] `escaped_tags`:: +(Optional, array of strings) +Array of HTML elements without enclosing angle brackets (`< >`). The filter +skips these HTML elements when stripping HTML from the text. For example, a +value of `[ "p" ]` skips the `

<p>` HTML element. - An array of HTML tags which should not be stripped from the original text. +[[analysis-htmlstrip-charfilter-customize]] +==== Customize -[float] -=== Example configuration +To customize the `html_strip` filter, duplicate it to create the basis +for a new custom character filter. You can modify the filter using its configurable +parameters. -In this example, we configure the `html_strip` character filter to leave `<b>` -tags in place: +The following <<indices-create-index,create index API>> request +configures a new <<analysis-custom-analyzer,custom analyzer>> using a custom +`html_strip` filter, `my_custom_html_strip_char_filter`. + +The `my_custom_html_strip_char_filter` filter skips the removal of the `<b>` +HTML element. [source,console] ----------------------------- +---- PUT my_index { "settings": { @@ -79,49 +111,20 @@ PUT my_index "analyzer": { "my_analyzer": { "tokenizer": "keyword", - "char_filter": ["my_char_filter"] + "char_filter": [ + "my_custom_html_strip_char_filter" + ] } }, "char_filter": { - "my_char_filter": { + "my_custom_html_strip_char_filter": { "type": "html_strip", - "escaped_tags": ["b"] + "escaped_tags": [ + "b" + ] } } } } } - -POST my_index/_analyze -{ - "analyzer": "my_analyzer", - "text": "

<p>I&apos;m so <b>happy</b>!</p>

" -} ----------------------------- - -///////////////////// - -[source,console-result] ----------------------------- -{ - "tokens": [ - { - "token": "\nI'm so happy!\n", - "start_offset": 0, - "end_offset": 32, - "type": "word", - "position": 0 - } - ] -} ----------------------------- - -///////////////////// - - -The above example produces the following term: - -[source,text] ---------------------------- -[ \nI'm so happy!\n ] ---------------------------- +----