[[analysis-delimited-payload-tokenfilter]]
=== Delimited payload token filter
++++
<titleabbrev>Delimited payload</titleabbrev>
++++

[WARNING]
====
The older name `delimited_payload_filter` is deprecated and should not be used
with new indices. Use `delimited_payload` instead.
====

Separates a token stream into tokens and payloads based on a specified
delimiter.

For example, you can use the `delimited_payload` filter with a `|` delimiter to
split `the|1 quick|2 fox|3` into the tokens `the`, `quick`, and `fox` with
respective payloads of `1`, `2`, and `3`.

This filter uses Lucene's
https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html[DelimitedPayloadTokenFilter].

[NOTE]
.Payloads
====
A payload is user-defined binary data associated with a token position and
stored as base64-encoded bytes.

{es} does not store token payloads by default. To store payloads, you must:

* Set the <<term-vector,`term_vector`>> mapping parameter to
  `with_positions_payloads` or `with_positions_offsets_payloads` for any field
  storing payloads.
* Use an index analyzer that includes the `delimited_payload` filter.

You can view stored payloads using the <<docs-termvectors,term vectors API>>.
====

[[analysis-delimited-payload-tokenfilter-analyze-ex]]
==== Example

The following <<indices-analyze,analyze API>> request uses the
`delimited_payload` filter with the default `|` delimiter to split
`the|0 brown|10 fox|5 is|0 quick|10` into tokens and payloads.

[source,console]
--------------------------------------------------
GET _analyze
{
  "tokenizer": "whitespace",
  "filter": ["delimited_payload"],
  "text": "the|0 brown|10 fox|5 is|0 quick|10"
}
--------------------------------------------------

The filter produces the following tokens:

[source,text]
--------------------------------------------------
[ the, brown, fox, is, quick ]
--------------------------------------------------

Note that the analyze API does not return stored payloads. For an example that
includes returned payloads, see
<<analysis-delimited-payload-tokenfilter-return-stored-payloads>>.

/////////////////////
[source,console-result]
--------------------------------------------------
{
  "tokens": [
    {
      "token": "the",
      "start_offset": 0,
      "end_offset": 5,
      "type": "word",
      "position": 0
    },
    {
      "token": "brown",
      "start_offset": 6,
      "end_offset": 14,
      "type": "word",
      "position": 1
    },
    {
      "token": "fox",
      "start_offset": 15,
      "end_offset": 20,
      "type": "word",
      "position": 2
    },
    {
      "token": "is",
      "start_offset": 21,
      "end_offset": 25,
      "type": "word",
      "position": 3
    },
    {
      "token": "quick",
      "start_offset": 26,
      "end_offset": 34,
      "type": "word",
      "position": 4
    }
  ]
}
--------------------------------------------------
/////////////////////
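To see the delimiter-splitting rule in isolation, here is a short,
self-contained Python sketch that mimics the filter's behavior conceptually.
This is an illustration only, not Lucene's implementation, and the
`split_token_payloads` helper is a made-up name.

[source,python]
--------------------------------------------------
def split_token_payloads(text, delimiter="|"):
    """Split whitespace-separated tokens into (token, payload) pairs.

    Tokens without a delimiter get a payload of None, mirroring how the
    filter passes through tokens that carry no payload.
    """
    pairs = []
    for word in text.split():
        token, sep, payload = word.partition(delimiter)
        pairs.append((token, payload if sep else None))
    return pairs

print(split_token_payloads("the|0 brown|10 fox|5 is|0 quick|10"))
# [('the', '0'), ('brown', '10'), ('fox', '5'), ('is', '0'), ('quick', '10')]
--------------------------------------------------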
[[analysis-delimited-payload-tokenfilter-analyzer-ex]]
==== Add to an analyzer

The following <<indices-create-index,create index API>> request uses the
`delimited_payload` filter to configure a new
<<analysis-custom-analyzer,custom analyzer>>.

[source,console]
--------------------------------------------------
PUT delimited_payload
{
  "settings": {
    "analysis": {
      "analyzer": {
        "whitespace_delimited_payload": {
          "tokenizer": "whitespace",
          "filter": [ "delimited_payload" ]
        }
      }
    }
  }
}
--------------------------------------------------

[[analysis-delimited-payload-tokenfilter-configure-parms]]
==== Configurable parameters

`delimiter`::
(Optional, string)
Character used to separate tokens from payloads. Defaults to `|`.

`encoding`::
+
--
(Optional, string)
Data type for the stored payload. Valid values are:

`float`:::
(Default) Float

`identity`:::
Characters

`int`:::
Integer
--

[[analysis-delimited-payload-tokenfilter-customize]]
==== Customize and add to an analyzer

To customize the `delimited_payload` filter, duplicate it to create the basis
for a new custom token filter. You can modify the filter using its configurable
parameters.

For example, the following <<indices-create-index,create index API>> request
uses a custom `delimited_payload` filter to configure a new
<<analysis-custom-analyzer,custom analyzer>>. The custom `delimited_payload`
filter uses the `+` delimiter to separate tokens from payloads. Payloads are
encoded as integers.

[source,console]
--------------------------------------------------
PUT delimited_payload_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "whitespace_plus_delimited": {
          "tokenizer": "whitespace",
          "filter": [ "plus_delimited" ]
        }
      },
      "filter": {
        "plus_delimited": {
          "type": "delimited_payload",
          "delimiter": "+",
          "encoding": "int"
        }
      }
    }
  }
}
--------------------------------------------------

[[analysis-delimited-payload-tokenfilter-return-stored-payloads]]
==== Return stored payloads

Use the <<indices-create-index,create index API>> to create an index that:

* Includes a field that stores term vectors with payloads.
* Uses a <<analysis-custom-analyzer,custom index analyzer>> with the
  `delimited_payload` filter.

[source,console]
--------------------------------------------------
PUT text_payloads
{
  "mappings": {
    "properties": {
      "text": {
        "type": "text",
        "term_vector": "with_positions_payloads",
        "analyzer": "payload_delimiter"
      }
    }
  },
  "settings": {
    "analysis": {
      "analyzer": {
        "payload_delimiter": {
          "tokenizer": "whitespace",
          "filter": [ "delimited_payload" ]
        }
      }
    }
  }
}
--------------------------------------------------

Add a document containing payloads to the index.

[source,console]
--------------------------------------------------
POST text_payloads/_doc/1
{
  "text": "the|0 brown|3 fox|4 is|0 quick|10"
}
--------------------------------------------------
// TEST[continued]

Use the <<docs-termvectors,term vectors API>> to return the document's tokens
and base64-encoded payloads.

[source,console]
--------------------------------------------------
GET text_payloads/_termvectors/1
{
  "fields": [ "text" ],
  "payloads": true
}
--------------------------------------------------
// TEST[continued]
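The same request can also be sent from a client library. The following is a
minimal sketch, assuming the official `elasticsearch` Python client and an
unsecured cluster at `http://localhost:9200`; adjust the connection details
for your deployment.

[source,python]
--------------------------------------------------
from elasticsearch import Elasticsearch

# Assumes a local, unsecured cluster; adjust for your setup.
es = Elasticsearch("http://localhost:9200")

# Request term vectors for document 1, including payloads.
response = es.termvectors(
    index="text_payloads",
    id="1",
    fields=["text"],
    payloads=True,
)

for term, info in response["term_vectors"]["text"]["terms"].items():
    print(term, info["tokens"])
--------------------------------------------------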
The API returns the following response:

[source,console-result]
--------------------------------------------------
{
  "_index": "text_payloads",
  "_id": "1",
  "_version": 1,
  "found": true,
  "took": 8,
  "term_vectors": {
    "text": {
      "field_statistics": {
        "sum_doc_freq": 5,
        "doc_count": 1,
        "sum_ttf": 5
      },
      "terms": {
        "brown": {
          "term_freq": 1,
          "tokens": [
            {
              "position": 1,
              "payload": "QEAAAA=="
            }
          ]
        },
        "fox": {
          "term_freq": 1,
          "tokens": [
            {
              "position": 2,
              "payload": "QIAAAA=="
            }
          ]
        },
        "is": {
          "term_freq": 1,
          "tokens": [
            {
              "position": 3,
              "payload": "AAAAAA=="
            }
          ]
        },
        "quick": {
          "term_freq": 1,
          "tokens": [
            {
              "position": 4,
              "payload": "QSAAAA=="
            }
          ]
        },
        "the": {
          "term_freq": 1,
          "tokens": [
            {
              "position": 0,
              "payload": "AAAAAA=="
            }
          ]
        }
      }
    }
  }
}
--------------------------------------------------
// TESTRESPONSE[s/"took": 8/"took": "$body.took"/]
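Because this example uses the default `float` encoding, each payload in the
response is a base64-encoded, big-endian 32-bit float. As a quick sanity
check, you can decode the payloads above with a few lines of Python:

[source,python]
--------------------------------------------------
import base64
import struct

# Payloads copied from the term vectors response above.
payloads = {
    "the": "AAAAAA==",
    "brown": "QEAAAA==",
    "fox": "QIAAAA==",
    "is": "AAAAAA==",
    "quick": "QSAAAA==",
}

for term, payload in payloads.items():
    # ">f" unpacks a big-endian 32-bit float, matching Lucene's encoding.
    value = struct.unpack(">f", base64.b64decode(payload))[0]
    print(term, value)

# Prints: the 0.0, brown 3.0, fox 4.0, is 0.0, quick 10.0
--------------------------------------------------

The decoded values match the numeric payloads in the indexed document,
`the|0 brown|3 fox|4 is|0 quick|10`.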