Commit

Merge branch 'main' into doc-test-ccs-feature

elasticmachine authored Sep 18, 2024
2 parents 8c37dae + 90e343c commit 7ec3715

Showing 172 changed files with 5,819 additions and 1,426 deletions.
15 changes: 3 additions & 12 deletions .buildkite/scripts/lucene-snapshot/update-branch.sh
@@ -7,21 +7,12 @@ if [[ "$BUILDKITE_BRANCH" != "lucene_snapshot"* ]]; then
exit 1
fi

if [[ "$BUILDKITE_BRANCH" == "lucene_snapshot_10" ]]; then
UPSTREAM="main"
elif [[ "$BUILDKITE_BRANCH" == "lucene_snapshot" ]]; then
UPSTREAM="8.x"
else
echo "Error: unknown branch: $BUILDKITE_BRANCH"
exit 1
fi

echo --- Updating "$BUILDKITE_BRANCH" branch with "$UPSTREAM"
echo --- Updating "$BUILDKITE_BRANCH" branch with main

git config --global user.name elasticsearchmachine
git config --global user.email '[email protected]'

git checkout "$BUILDKITE_BRANCH"
git fetch origin "$UPSTREAM"
git merge --no-edit "origin/$UPSTREAM"
git fetch origin main
git merge --no-edit origin/main
git push origin "$BUILDKITE_BRANCH"
@@ -20,7 +20,6 @@
import org.elasticsearch.xcontent.XContentParserConfiguration;
import org.elasticsearch.xcontent.XContentType;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -106,9 +105,7 @@ private void verifyOverview() throws Exception {
private void verifyTarball(Map<String, byte[]> data) throws Exception {
for (String tgz : List.of("a.tgz", "b.tgz")) {
try (
TarArchiveInputStream tis = new TarArchiveInputStream(
new GZIPInputStream(new BufferedInputStream(Files.newInputStream(target.resolve(tgz))))
)
TarArchiveInputStream tis = new TarArchiveInputStream(new GZIPInputStream(Files.newInputStream(target.resolve(tgz)), 8192))
) {
TarArchiveEntry entry = tis.getNextTarEntry();
assertNotNull(entry);
5 changes: 5 additions & 0 deletions docs/changelog/111770.yaml
@@ -0,0 +1,5 @@
pr: 111770
summary: Integrate IBM watsonx to Inference API for text embeddings
area: Experiences
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/112565.yaml
@@ -0,0 +1,5 @@
pr: 112565
summary: Server-Sent Events for Inference response
area: Machine Learning
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/112874.yaml
@@ -0,0 +1,5 @@
pr: 112874
summary: Reduce heap usage for `AggregatorsReducer`
area: Aggregations
type: enhancement
issues: []
@@ -117,15 +117,15 @@ All unique passages, along with their IDs, have been extracted from that data set
https://github.com/elastic/stack-docs/blob/main/docs/en/stack/ml/nlp/data/msmarco-passagetest2019-unique.tsv[tsv file].

IMPORTANT: The `msmarco-passagetest2019-top1000` dataset was not utilized to train the model.
It is only used in this tutorial as a sample dataset that is easily accessible for demonstration purposes.
We use this sample dataset in the tutorial because it is easily accessible for demonstration purposes.
You can use a different data set to test the workflow and become familiar with it.

Download the file and upload it to your cluster using the
{kibana-ref}/connect-to-elasticsearch.html#upload-data-kibana[Data Visualizer]
in the {ml-app} UI.
Assign the name `id` to the first column and `content` to the second column.
The index name is `test-data`.
Once the upload is complete, you can see an index named `test-data` with 182469 documents.
Download the file and upload it to your cluster using the {kibana-ref}/connect-to-elasticsearch.html#upload-data-kibana[File Uploader] in the UI.
After your data is analyzed, click **Override settings**.
Under **Edit field names**, assign `id` to the first column and `content` to the second.
Click **Apply**, then **Import**.
Name the index `test-data`, and click **Import**.
After the upload is complete, you will see an index named `test-data` with 182,469 documents.
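
If you prefer to verify the upload from the Console instead of the UI, a request like the following (using the `test-data` index name from above) returns the document count:

[source,console]
----
GET test-data/_count
----
// TEST[skip:TBD]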

[discrete]
[[reindexing-data-elser]]
@@ -161,6 +161,18 @@ GET _tasks/<task_id>

You can also open the Trained Models UI and select the Pipelines tab under ELSER to follow the progress.

Reindexing large datasets can take a long time.
You can test this workflow using only a subset of the dataset.
Do this by cancelling the reindexing process and generating embeddings only for the subset that was reindexed.
The following API request will cancel the reindexing task:

[source,console]
----
POST _tasks/<task_id>/_cancel
----
// TEST[skip:TBD]

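Alternatively, you can cap the reindex up front instead of cancelling it, using the `max_docs` parameter of the reindex API. The request below is a minimal sketch; `my-index` and `my-elser-pipeline` are placeholders for the destination index and ingest pipeline you created earlier in this tutorial:

[source,console]
----
POST _reindex?wait_for_completion=false
{
  "max_docs": 1000,
  "source": {
    "index": "test-data"
  },
  "dest": {
    "index": "my-index",
    "pipeline": "my-elser-pipeline"
  }
}
----
// TEST[skip:TBD]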

[discrete]
[[text-expansion-query]]
==== Semantic search by using the `sparse_vector` query
@@ -68,12 +68,12 @@ It consists of 200 queries, each accompanied by a list of relevant text passages
All unique passages, along with their IDs, have been extracted from that data set and compiled into a
https://github.com/elastic/stack-docs/blob/main/docs/en/stack/ml/nlp/data/msmarco-passagetest2019-unique.tsv[tsv file].

Download the file and upload it to your cluster using the
{kibana-ref}/connect-to-elasticsearch.html#upload-data-kibana[Data Visualizer]
in the {ml-app} UI.
Assign the name `id` to the first column and `content` to the second column.
The index name is `test-data`.
Once the upload is complete, you can see an index named `test-data` with 182469 documents.
Download the file and upload it to your cluster using the {kibana-ref}/connect-to-elasticsearch.html#upload-data-kibana[Data Visualizer] in the {ml-app} UI.
After your data is analyzed, click **Override settings**.
Under **Edit field names**, assign `id` to the first column and `content` to the second.
Click **Apply**, then **Import**.
Name the index `test-data`, and click **Import**.
After the upload is complete, you will see an index named `test-data` with 182,469 documents.

[discrete]
[[reindexing-data-infer]]
@@ -92,7 +92,10 @@ GET _tasks/<task_id>
----
// TEST[skip:TBD]

You can also cancel the reindexing process if you don't want to wait until the reindexing process is fully complete which might take hours for large data sets:
Reindexing large datasets can take a long time.
You can test this workflow using only a subset of the dataset.
Do this by cancelling the reindexing process and generating embeddings only for the subset that was reindexed.
The following API request will cancel the reindexing task:

[source,console]
----
@@ -96,11 +96,12 @@ a list of relevant text passages. All unique passages, along with their IDs,
have been extracted from that data set and compiled into a
https://github.com/elastic/stack-docs/blob/main/docs/en/stack/ml/nlp/data/msmarco-passagetest2019-unique.tsv[tsv file].

Download the file and upload it to your cluster using the
{kibana-ref}/connect-to-elasticsearch.html#upload-data-kibana[Data Visualizer]
in the {ml-app} UI. Assign the name `id` to the first column and `content` to
the second column. The index name is `test-data`. Once the upload is complete,
you can see an index named `test-data` with 182469 documents.
Download the file and upload it to your cluster using the {kibana-ref}/connect-to-elasticsearch.html#upload-data-kibana[Data Visualizer] in the {ml-app} UI.
After your data is analyzed, click **Override settings**.
Under **Edit field names**, assign `id` to the first column and `content` to the second.
Click **Apply**, then **Import**.
Name the index `test-data`, and click **Import**.
After the upload is complete, you will see an index named `test-data` with 182,469 documents.


[discrete]
@@ -137,8 +138,10 @@ GET _tasks/<task_id>
------------------------------------------------------------
// TEST[skip:TBD]

It is recommended to cancel the reindexing process if you don't want to wait
until it is fully complete which might take a long time for an inference endpoint with few assigned resources:
Reindexing large datasets can take a long time.
You can test this workflow using only a subset of the dataset.
Do this by cancelling the reindexing process and generating embeddings only for the subset that was reindexed.
The following API request will cancel the reindexing task:

[source,console]
------------------------------------------------------------
2 changes: 1 addition & 1 deletion docs/reference/security/authentication/saml-guide.asciidoc
@@ -32,7 +32,7 @@ that supports at least the SAML 2.0 _Web Browser SSO Profile_.
It has been tested with a number of popular IdP implementations, such as
https://www.elastic.co/blog/how-to-configure-elasticsearch-saml-authentication-with-adfs[Microsoft Active Directory Federation Services (ADFS)],
https://www.elastic.co/blog/saml-based-single-sign-on-with-elasticsearch-and-azure-active-directory[Azure Active Directory (AAD)],
and https://www.elastic.co/blog/setting-up-saml-for-elastic-enterprise-search-okta-edition[Okta].
and https://www.elastic.co/blog/how-to-set-up-okta-saml-login-kibana-elastic-cloud[Okta].

This guide assumes that you have an existing IdP and wish to add {kib} as a
Service Provider.
@@ -101,12 +101,7 @@
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.common.logging.DeprecationCategory;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.CharFilterFactory;
@@ -139,8 +134,6 @@

public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, ScriptPlugin {

private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(CommonAnalysisPlugin.class);

private final SetOnce<ScriptService> scriptServiceHolder = new SetOnce<>();
private final SetOnce<SynonymsManagementAPIService> synonymsManagementServiceHolder = new SetOnce<>();

@@ -231,28 +224,6 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("dictionary_decompounder", requiresAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
filters.put("dutch_stem", DutchStemTokenFilterFactory::new);
filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new);
filters.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
return new EdgeNGramTokenFilterFactory(indexSettings, environment, name, settings) {
@Override
public TokenStream create(TokenStream tokenStream) {
if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_8_0_0)) {
throw new IllegalArgumentException(
"The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
+ "Please change the filter name to [edge_ngram] instead."
);
} else {
deprecationLogger.warn(
DeprecationCategory.ANALYSIS,
"edgeNGram_deprecation",
"The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [edge_ngram] instead."
);
}
return super.create(tokenStream);
}

};
});
filters.put("elision", requiresAnalysisSettings(ElisionTokenFilterFactory::new));
filters.put("fingerprint", FingerprintTokenFilterFactory::new);
filters.put("flatten_graph", FlattenGraphTokenFilterFactory::new);
@@ -272,28 +243,6 @@ public TokenStream create(TokenStream tokenStream) {
filters.put("min_hash", MinHashTokenFilterFactory::new);
filters.put("multiplexer", MultiplexerTokenFilterFactory::new);
filters.put("ngram", NGramTokenFilterFactory::new);
filters.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
return new NGramTokenFilterFactory(indexSettings, environment, name, settings) {
@Override
public TokenStream create(TokenStream tokenStream) {
if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_8_0_0)) {
throw new IllegalArgumentException(
"The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
+ "Please change the filter name to [ngram] instead."
);
} else {
deprecationLogger.warn(
DeprecationCategory.ANALYSIS,
"nGram_deprecation",
"The [nGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [ngram] instead."
);
}
return super.create(tokenStream);
}

};
});
filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
@@ -345,39 +294,7 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
tokenizers.put("thai", ThaiTokenizerFactory::new);
tokenizers.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_8_0_0)) {
throw new IllegalArgumentException(
"The [nGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead."
);
} else if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_7_6_0)) {
deprecationLogger.warn(
DeprecationCategory.ANALYSIS,
"nGram_tokenizer_deprecation",
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [ngram] instead."
);
}
return new NGramTokenizerFactory(indexSettings, environment, name, settings);
});
tokenizers.put("ngram", NGramTokenizerFactory::new);
tokenizers.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_8_0_0)) {
throw new IllegalArgumentException(
"The [edgeNGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [edge_nGram] for indices created in versions 8 or higher instead."
);
} else if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_7_6_0)) {
deprecationLogger.warn(
DeprecationCategory.ANALYSIS,
"edgeNGram_tokenizer_deprecation",
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [edge_ngram] instead."
);
}
return new EdgeNGramTokenizerFactory(indexSettings, environment, name, settings);
});
tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
tokenizers.put("char_group", CharGroupTokenizerFactory::new);
tokenizers.put("classic", ClassicTokenizerFactory::new);
@@ -588,54 +505,17 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new));
tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new));
tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new));
tokenizers.add(PreConfiguredTokenizer.indexVersion("edge_ngram", (version) -> {
if (version.onOrAfter(IndexVersions.V_7_3_0)) {
return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
}
return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
}));
tokenizers.add(
PreConfiguredTokenizer.indexVersion(
"edge_ngram",
(version) -> new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE)
)
);
tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1)));
tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new));
// TODO deprecate and remove in API
// This is already broken with normalization, so backwards compat isn't necessary?
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new));

// Temporary shim for aliases. TODO deprecate after they are moved
tokenizers.add(PreConfiguredTokenizer.indexVersion("nGram", (version) -> {
if (version.onOrAfter(IndexVersions.V_8_0_0)) {
throw new IllegalArgumentException(
"The [nGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead."
);
} else if (version.onOrAfter(IndexVersions.V_7_6_0)) {
deprecationLogger.warn(
DeprecationCategory.ANALYSIS,
"nGram_tokenizer_deprecation",
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [ngram] instead."
);
}
return new NGramTokenizer();
}));
tokenizers.add(PreConfiguredTokenizer.indexVersion("edgeNGram", (version) -> {
if (version.onOrAfter(IndexVersions.V_8_0_0)) {
throw new IllegalArgumentException(
"The [edgeNGram] tokenizer name was deprecated in 7.6. "
+ "Please use the tokenizer name to [edge_ngram] for indices created in versions 8 or higher instead."
);
} else if (version.onOrAfter(IndexVersions.V_7_6_0)) {
deprecationLogger.warn(
DeprecationCategory.ANALYSIS,
"edgeNGram_tokenizer_deprecation",
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [edge_ngram] instead."
);
}
if (version.onOrAfter(IndexVersions.V_7_3_0)) {
return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
}
return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
}));
tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new));

return tokenizers;
