From ea7ee2f189f798a70fc49a121b087b91345b4cec Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Thu, 21 Oct 2021 15:38:10 -0400 Subject: [PATCH 1/3] [ML] Update trained model docs for truncate parameter for bert tokenization --- .../apis/get-trained-models.asciidoc | 24 +++++++++++++++++++ .../apis/put-trained-models.asciidoc | 24 +++++++++++++++++++ docs/reference/ml/ml-shared.asciidoc | 18 ++++++++++++++ 3 files changed, 66 insertions(+) diff --git a/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc b/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc index 10b298c76b32..df25992d1bc3 100644 --- a/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc +++ b/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc @@ -196,6 +196,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -250,6 +254,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -297,6 +305,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) 
include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -367,6 +379,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -414,6 +430,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -476,6 +496,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] diff --git 
a/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc b/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc index 93bea72f9a0a..35315a545469 100644 --- a/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc +++ b/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc @@ -456,6 +456,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -498,6 +502,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -534,6 +542,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -593,6 +605,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, 
integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -628,6 +644,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] @@ -679,6 +699,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati (Optional, integer) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length] +`truncate`:::: +(Optional, string) +include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate] + `with_special_tokens`:::: (Optional, boolean) include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens] diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc index e439080baa50..03ac1fa44162 100644 --- a/docs/reference/ml/ml-shared.asciidoc +++ b/docs/reference/ml/ml-shared.asciidoc @@ -925,6 +925,24 @@ Should the tokenization lower case the text sequence when building the tokens. end::inference-config-nlp-tokenization-bert-do-lower-case[] +tag::inference-config-nlp-tokenization-bert-truncate[] +Indicates how tokens should be truncated if they exceed `max_sequence_length`. + +The default value is `first`. 
++ +-- +* `none`: No truncation takes place and instead the inference request will error. +* `first`: Only the first sequence should be truncated. If there is just one sequence, + then it will be truncated. +* `second`: Only the second sequence should be truncated. If there is just one sequence, + then it will be truncated. +-- + +NOTE: for `zero_shot_classification`, the hypothesis sequence is always the second +sequence. So, if truncation is preferred to erroring, then `first` should be used. + +end::inference-config-nlp-tokenization-bert-truncate[] + tag::inference-config-nlp-tokenization-bert-with-special-tokens[] Tokenize with special tokens. The tokens typically included in BERT-style tokenization are: + From a949d979496bbb2767217d1a388c15aad254a9a1 Mon Sep 17 00:00:00 2001 From: lcawl Date: Mon, 25 Oct 2021 13:32:36 -0700 Subject: [PATCH 2/3] [DOCS] Fixes formatting --- docs/reference/ml/ml-shared.asciidoc | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc index 03ac1fa44162..ce80bae3b2c1 100644 --- a/docs/reference/ml/ml-shared.asciidoc +++ b/docs/reference/ml/ml-shared.asciidoc @@ -927,7 +927,6 @@ end::inference-config-nlp-tokenization-bert-do-lower-case[] tag::inference-config-nlp-tokenization-bert-truncate[] Indicates how tokens should be truncated if they exceed `max_sequence_length`. - The default value is `first`. 
+ -- From 0587c8996ef52561359040e95a055367f19b1178 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Tue, 26 Oct 2021 07:29:26 -0400 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Lisa Cawley --- docs/reference/ml/ml-shared.asciidoc | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc index ce80bae3b2c1..46e9aaa63ecb 100644 --- a/docs/reference/ml/ml-shared.asciidoc +++ b/docs/reference/ml/ml-shared.asciidoc @@ -926,19 +926,18 @@ the tokens. end::inference-config-nlp-tokenization-bert-do-lower-case[] tag::inference-config-nlp-tokenization-bert-truncate[] -Indicates how tokens should be truncated if they exceed `max_sequence_length`. +Indicates how tokens are truncated when they exceed `max_sequence_length`. The default value is `first`. + -- -* `none`: No truncation takes place and instead the inference request will error. -* `first`: Only the first sequence should be truncated. If there is just one sequence, - then it will be truncated. -* `second`: Only the second sequence should be truncated. If there is just one sequence, - then it will be truncated. +* `none`: No truncation occurs; the inference request fails with an error. +* `first`: Only the first sequence is truncated. +* `second`: Only the second sequence is truncated. If there is just one sequence, + that sequence is truncated. -- -NOTE: for `zero_shot_classification`, the hypothesis sequence is always the second -sequence. So, if truncation is preferred to erroring, then `first` should be used. +NOTE: For `zero_shot_classification`, the hypothesis sequence is always the second +sequence. Therefore, do not use `second` in this case. end::inference-config-nlp-tokenization-bert-truncate[]