From ea82dfb79ccc5da8a1311e9e22586d9113287f54 Mon Sep 17 00:00:00 2001
From: James Rodewig
Date: Wed, 30 Oct 2019 17:14:05 -0400
Subject: [PATCH 1/2] [DOCS] Reformat decimal digit token filter

---
 .../decimal-digit-tokenfilter.asciidoc | 138 +++++++++++++++++-
 1 file changed, 136 insertions(+), 2 deletions(-)

diff --git a/docs/reference/analysis/tokenfilters/decimal-digit-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/decimal-digit-tokenfilter.asciidoc
index 8dede54d0d264..14a2542a08aa5 100644
--- a/docs/reference/analysis/tokenfilters/decimal-digit-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/decimal-digit-tokenfilter.asciidoc
@@ -1,4 +1,138 @@
 [[analysis-decimal-digit-tokenfilter]]
-=== Decimal Digit Token Filter
+=== Decimal digit token filter
+++++
+Decimal digit
+++++
 
-The `decimal_digit` token filter folds unicode digits to `0-9`
+Converts all digits in the Unicode `Decimal_Number` General Category to `0-9`.
+For example, the filter changes the Bengali numeral `৩` to `3`.
+
+This filter uses Lucene's
+https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/core/DecimalDigitFilter.html[DecimalDigitFilter].
+
+[[analysis-decimal-digit-tokenfilter-analyze-ex]]
+==== Example
+
+The following <<indices-analyze,analyze API>> request uses the `decimal_digit`
+filter to convert Devanagari numerals to `0-9`:
+
+[source,console]
+--------------------------------------------------
+GET /_analyze
+{
+  "tokenizer" : "whitespace",
+  "filter" : ["decimal_digit"],
+  "text" : "० १ २ ३ ४ ५ ६ ७ ८ ९"
+}
+--------------------------------------------------
+
+The filter produces the following tokens:
+
+[source,text]
+--------------------------------------------------
+[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
+--------------------------------------------------
+
+/////////////////////
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens" : [
+    {
+      "token" : "0",
+      "start_offset" : 0,
+      "end_offset" : 1,
+      "type" : "word",
+      "position" : 0
+    },
+    {
+      "token" : "1",
+      "start_offset" : 2,
+      "end_offset" : 3,
+      "type" : "word",
+      "position" : 1
+    },
+    {
+      "token" : "2",
+      "start_offset" : 4,
+      "end_offset" : 5,
+      "type" : "word",
+      "position" : 2
+    },
+    {
+      "token" : "3",
+      "start_offset" : 6,
+      "end_offset" : 7,
+      "type" : "word",
+      "position" : 3
+    },
+    {
+      "token" : "4",
+      "start_offset" : 8,
+      "end_offset" : 9,
+      "type" : "word",
+      "position" : 4
+    },
+    {
+      "token" : "5",
+      "start_offset" : 10,
+      "end_offset" : 11,
+      "type" : "word",
+      "position" : 5
+    },
+    {
+      "token" : "6",
+      "start_offset" : 12,
+      "end_offset" : 13,
+      "type" : "word",
+      "position" : 6
+    },
+    {
+      "token" : "7",
+      "start_offset" : 14,
+      "end_offset" : 15,
+      "type" : "word",
+      "position" : 7
+    },
+    {
+      "token" : "8",
+      "start_offset" : 16,
+      "end_offset" : 17,
+      "type" : "word",
+      "position" : 8
+    },
+    {
+      "token" : "9",
+      "start_offset" : 18,
+      "end_offset" : 19,
+      "type" : "word",
+      "position" : 9
+    }
+  ]
+}
+--------------------------------------------------
+/////////////////////
+
+[[analysis-decimal-digit-tokenfilter-analyzer-ex]]
+==== Add to an analyzer
+
+The following <<indices-create-index,create index API>> request uses the
+`decimal_digit` filter to configure a new
+<<analysis-custom-analyzer,custom analyzer>>.
+
+[source,console]
+--------------------------------------------------
+PUT /decimal_digit_example
+{
+  "settings" : {
+    "analysis" : {
+      "analyzer" : {
+        "whitespace_decimal_digit" : {
+          "tokenizer" : "whitespace",
+          "filter" : ["decimal_digit"]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------

From f7416cd92be83f7c9548c9291384300670561319 Mon Sep 17 00:00:00 2001
From: James Rodewig
Date: Fri, 1 Nov 2019 12:13:22 -0400
Subject: [PATCH 2/2] iter

---
 .../decimal-digit-tokenfilter.asciidoc | 65 +++----------------
 1 file changed, 8 insertions(+), 57 deletions(-)

diff --git a/docs/reference/analysis/tokenfilters/decimal-digit-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/decimal-digit-tokenfilter.asciidoc
index 14a2542a08aa5..58303185c1351 100644
--- a/docs/reference/analysis/tokenfilters/decimal-digit-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/decimal-digit-tokenfilter.asciidoc
@@ -22,7 +22,7 @@ GET /_analyze
 {
   "tokenizer" : "whitespace",
   "filter" : ["decimal_digit"],
-  "text" : "० १ २ ३ ४ ५ ६ ७ ८ ९"
+  "text" : "१-one two-२ ३"
 }
 --------------------------------------------------
 
@@ -30,7 +30,7 @@ The filter produces the following tokens:
 
 [source,text]
 --------------------------------------------------
-[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ]
+[ 1-one, two-2, 3 ]
 --------------------------------------------------
 
 /////////////////////
@@ -39,74 +39,25 @@ The filter produces the following tokens:
 {
   "tokens" : [
     {
-      "token" : "0",
+      "token" : "1-one",
       "start_offset" : 0,
-      "end_offset" : 1,
-      "type" : "word",
-      "position" : 0
-    },
-    {
-      "token" : "1",
-      "start_offset" : 2,
-      "end_offset" : 3,
-      "type" : "word",
-      "position" : 1
-    },
-    {
-      "token" : "2",
-      "start_offset" : 4,
       "end_offset" : 5,
       "type" : "word",
-      "position" : 2
+      "position" : 0
     },
     {
-      "token" : "3",
+      "token" : "two-2",
       "start_offset" : 6,
-      "end_offset" : 7,
-      "type" : "word",
-      "position" : 3
-    },
-    {
-      "token" : "4",
-      "start_offset" : 8,
-      "end_offset" : 9,
-      "type" : "word",
-      "position" : 4
-    },
-    {
-      "token" : "5",
-      "start_offset" : 10,
       "end_offset" : 11,
       "type" : "word",
-      "position" : 5
+      "position" : 1
     },
     {
-      "token" : "6",
+      "token" : "3",
       "start_offset" : 12,
       "end_offset" : 13,
       "type" : "word",
-      "position" : 6
-    },
-    {
-      "token" : "7",
-      "start_offset" : 14,
-      "end_offset" : 15,
-      "type" : "word",
-      "position" : 7
-    },
-    {
-      "token" : "8",
-      "start_offset" : 16,
-      "end_offset" : 17,
-      "type" : "word",
-      "position" : 8
-    },
-    {
-      "token" : "9",
-      "start_offset" : 18,
-      "end_offset" : 19,
-      "type" : "word",
-      "position" : 9
+      "position" : 2
     }
   ]
 }
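To spot-check the `whitespace_decimal_digit` analyzer configured above, the analyze API can be run against the `decimal_digit_example` index. This is a minimal sketch that assumes the index was created exactly as in the `PUT /decimal_digit_example` request; the index and analyzer names come from that request.

[source,console]
--------------------------------------------------
GET /decimal_digit_example/_analyze
{
  "analyzer" : "whitespace_decimal_digit",
  "text" : "१-one two-२ ३"
}
--------------------------------------------------

Because the custom analyzer pairs the `whitespace` tokenizer with the `decimal_digit` filter, this request should return the same `1-one`, `two-2`, and `3` tokens as the earlier standalone `_analyze` example.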