From ea7ee2f189f798a70fc49a121b087b91345b4cec Mon Sep 17 00:00:00 2001
From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com>
Date: Thu, 21 Oct 2021 15:38:10 -0400
Subject: [PATCH 1/3] [ML] Update trained model docs for truncate parameter for
 bert tokenization

---
 .../apis/get-trained-models.asciidoc          | 24 +++++++++++++++++++
 .../apis/put-trained-models.asciidoc          | 24 +++++++++++++++++++
 docs/reference/ml/ml-shared.asciidoc          | 18 ++++++++++++++
 3 files changed, 66 insertions(+)

diff --git a/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc b/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc
index 10b298c76b32..df25992d1bc3 100644
--- a/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc
+++ b/docs/reference/ml/df-analytics/apis/get-trained-models.asciidoc
@@ -196,6 +196,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
@@ -250,6 +254,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
@@ -297,6 +305,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
@@ -367,6 +379,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
@@ -414,6 +430,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
@@ -476,6 +496,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
diff --git a/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc b/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc
index 93bea72f9a0a..35315a545469 100644
--- a/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc
+++ b/docs/reference/ml/df-analytics/apis/put-trained-models.asciidoc
@@ -456,6 +456,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
@@ -498,6 +502,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
@@ -534,6 +542,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
@@ -593,6 +605,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
@@ -628,6 +644,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
@@ -679,6 +699,10 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, integer)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
 
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
 `with_special_tokens`::::
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc
index e439080baa50..03ac1fa44162 100644
--- a/docs/reference/ml/ml-shared.asciidoc
+++ b/docs/reference/ml/ml-shared.asciidoc
@@ -925,6 +925,24 @@ Should the tokenization lower case the text sequence when building
 the tokens.
 end::inference-config-nlp-tokenization-bert-do-lower-case[]
 
+tag::inference-config-nlp-tokenization-bert-truncate[]
+Indicates how tokens should be truncated if they exceed `max_sequence_length`.
+
+The default value is `first`.
++
+--
+* `none`: No truncation takes place and instead the inference request will error.
+* `first`: Only the first sequence should be truncated. If there is just one sequence,
+					 then it will be truncated.
+* `second`: Only the second sequence should be truncated. If there is just one sequence,
+					 then it will be truncated.
+--
+
+NOTE: for `zero_shot_classification`, the hypothesis sequence is always the second 
+sequence. So, if truncation is preferred to erroring, then `first` should be used.
+
+end::inference-config-nlp-tokenization-bert-truncate[]
+
 tag::inference-config-nlp-tokenization-bert-with-special-tokens[]
 Tokenize with special tokens. The tokens typically included in BERT-style tokenization are:
 +

From a949d979496bbb2767217d1a388c15aad254a9a1 Mon Sep 17 00:00:00 2001
From: lcawl <lcawley@elastic.co>
Date: Mon, 25 Oct 2021 13:32:36 -0700
Subject: [PATCH 2/3] [DOCS] Fixes formatting

---
 docs/reference/ml/ml-shared.asciidoc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc
index 03ac1fa44162..ce80bae3b2c1 100644
--- a/docs/reference/ml/ml-shared.asciidoc
+++ b/docs/reference/ml/ml-shared.asciidoc
@@ -927,7 +927,6 @@ end::inference-config-nlp-tokenization-bert-do-lower-case[]
 
 tag::inference-config-nlp-tokenization-bert-truncate[]
 Indicates how tokens should be truncated if they exceed `max_sequence_length`.
-
 The default value is `first`.
 +
 --

From 0587c8996ef52561359040e95a055367f19b1178 Mon Sep 17 00:00:00 2001
From: Benjamin Trent <ben.w.trent@gmail.com>
Date: Tue, 26 Oct 2021 07:29:26 -0400
Subject: [PATCH 3/3] Apply suggestions from code review

Co-authored-by: Lisa Cawley <lcawley@elastic.co>
---
 docs/reference/ml/ml-shared.asciidoc | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc
index ce80bae3b2c1..46e9aaa63ecb 100644
--- a/docs/reference/ml/ml-shared.asciidoc
+++ b/docs/reference/ml/ml-shared.asciidoc
@@ -926,19 +926,18 @@ the tokens.
 end::inference-config-nlp-tokenization-bert-do-lower-case[]
 
 tag::inference-config-nlp-tokenization-bert-truncate[]
-Indicates how tokens should be truncated if they exceed `max_sequence_length`.
+Indicates how tokens are truncated when they exceed `max_sequence_length`.
 The default value is `first`.
 +
 --
-* `none`: No truncation takes place and instead the inference request will error.
-* `first`: Only the first sequence should be truncated. If there is just one sequence,
-					 then it will be truncated.
-* `second`: Only the second sequence should be truncated. If there is just one sequence,
-					 then it will be truncated.
+* `none`: No truncation occurs; the inference request receives an error.
+* `first`: Only the first sequence is truncated.
+* `second`: Only the second sequence is truncated. If there is just one sequence,
+					 that sequence is truncated.
 --
 
-NOTE: for `zero_shot_classification`, the hypothesis sequence is always the second 
-sequence. So, if truncation is preferred to erroring, then `first` should be used.
+NOTE: For `zero_shot_classification`, the hypothesis sequence is always the second 
+sequence. Therefore, do not use `second` in this case.
 
 end::inference-config-nlp-tokenization-bert-truncate[]