Merge branch 'main' of github.com:elastic/elasticsearch into batch-th…

…e-chunks
davidkyle · Oct 28, 2024 · 19deea7 · 19deea7
2 parents 855790b + 98cd34f
commit 19deea7
Show file tree

Hide file tree

Showing 97 changed files with 4,062 additions and 1,663 deletions.
diff --git a/build-tools-internal/src/main/groovy/elasticsearch.ide.gradle b/build-tools-internal/src/main/groovy/elasticsearch.ide.gradle
@@ -122,6 +122,36 @@ if (providers.systemProperty('idea.active').getOrNull() == 'true') {
       .findAll { it != null }
   }
 
+  // force IntelliJ to generate *.iml files for each imported module
+  tasks.register("enableExternalConfiguration") {
+    group = 'ide'
+    description = 'Enable per-module *.iml files'
+
+    doLast {
+      modifyXml('.idea/misc.xml') {xml ->
+        def externalStorageConfig = xml.component.find { it.'@name' == 'ExternalStorageConfigurationManager' }
+        if (externalStorageConfig) {
+          xml.remove(externalStorageConfig)
+        }
+      }
+    }
+  }
+
+  // modifies the idea module config to enable preview features on 'elasticsearch-native' module
+  tasks.register("enablePreviewFeatures") {
+    group = 'ide'
+    description = 'Enables preview features on native library module'
+    dependsOn tasks.named("enableExternalConfiguration")
+
+    doLast {
+      ['main', 'test'].each { sourceSet ->
+        modifyXml(".idea/modules/libs/native/elasticsearch.libs.elasticsearch-native.${sourceSet}.iml") { xml ->
+          xml.component.find { it.'@name' == 'NewModuleRootManager' }?.'@LANGUAGE_LEVEL' = 'JDK_21_PREVIEW'
+        }
+      }
+    }
+  }
+
   tasks.register('buildDependencyArtifacts') {
     group = 'ide'
     description = 'Builds artifacts needed as dependency for IDE modules'
@@ -149,7 +179,10 @@ if (providers.systemProperty('idea.active').getOrNull() == 'true') {
           testRunner = 'choose_per_test'
         }
         taskTriggers {
-          afterSync tasks.named('configureIdeCheckstyle'), tasks.named('configureIdeaGradleJvm'), tasks.named('buildDependencyArtifacts')
+          afterSync tasks.named('configureIdeCheckstyle'),
+            tasks.named('configureIdeaGradleJvm'),
+            tasks.named('buildDependencyArtifacts'),
+            tasks.named('enablePreviewFeatures')
         }
         encodings {
           encoding = 'UTF-8'

diff --git a/...l/src/main/java/org/elasticsearch/gradle/internal/InternalDistributionDownloadPlugin.java b/...l/src/main/java/org/elasticsearch/gradle/internal/InternalDistributionDownloadPlugin.java
@@ -172,9 +172,6 @@ private static String distributionProjectName(ElasticsearchDistribution distribu
         if (distribution.getType() == InternalElasticsearchDistributionTypes.DOCKER_IRONBANK) {
             return projectName + "ironbank-docker" + archString + "-export";
         }
-        if (distribution.getType() == InternalElasticsearchDistributionTypes.DOCKER_CLOUD) {
-            return projectName + "cloud-docker" + archString + "-export";
-        }
         if (distribution.getType() == InternalElasticsearchDistributionTypes.DOCKER_CLOUD_ESS) {
             return projectName + "cloud-ess-docker" + archString + "-export";
         }

diff --git a/.../elasticsearch/gradle/internal/distribution/DockerCloudElasticsearchDistributionType.java b/.../elasticsearch/gradle/internal/distribution/DockerCloudElasticsearchDistributionType.java
diff --git a/...rg/elasticsearch/gradle/internal/distribution/InternalElasticsearchDistributionTypes.java b/...rg/elasticsearch/gradle/internal/distribution/InternalElasticsearchDistributionTypes.java
@@ -19,7 +19,6 @@ public class InternalElasticsearchDistributionTypes {
     public static ElasticsearchDistributionType DOCKER = new DockerElasticsearchDistributionType();
     public static ElasticsearchDistributionType DOCKER_UBI = new DockerUbiElasticsearchDistributionType();
     public static ElasticsearchDistributionType DOCKER_IRONBANK = new DockerIronBankElasticsearchDistributionType();
-    public static ElasticsearchDistributionType DOCKER_CLOUD = new DockerCloudElasticsearchDistributionType();
     public static ElasticsearchDistributionType DOCKER_CLOUD_ESS = new DockerCloudEssElasticsearchDistributionType();
     public static ElasticsearchDistributionType DOCKER_WOLFI = new DockerWolfiElasticsearchDistributionType();
 
@@ -29,7 +28,6 @@ public class InternalElasticsearchDistributionTypes {
         DOCKER,
         DOCKER_UBI,
         DOCKER_IRONBANK,
-        DOCKER_CLOUD,
         DOCKER_CLOUD_ESS,
         DOCKER_WOLFI
     );

diff --git a/...tools-internal/src/main/java/org/elasticsearch/gradle/internal/test/DistroTestPlugin.java b/...tools-internal/src/main/java/org/elasticsearch/gradle/internal/test/DistroTestPlugin.java
@@ -49,7 +49,6 @@
 import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.ALL_INTERNAL;
 import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DEB;
 import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER;
-import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_CLOUD;
 import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_CLOUD_ESS;
 import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_IRONBANK;
 import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_UBI;
@@ -149,7 +148,6 @@ private static Map<ElasticsearchDistributionType, TaskProvider<?>> lifecycleTask
         lifecyleTasks.put(DOCKER, project.getTasks().register(taskPrefix + ".docker"));
         lifecyleTasks.put(DOCKER_UBI, project.getTasks().register(taskPrefix + ".docker-ubi"));
         lifecyleTasks.put(DOCKER_IRONBANK, project.getTasks().register(taskPrefix + ".docker-ironbank"));
-        lifecyleTasks.put(DOCKER_CLOUD, project.getTasks().register(taskPrefix + ".docker-cloud"));
         lifecyleTasks.put(DOCKER_CLOUD_ESS, project.getTasks().register(taskPrefix + ".docker-cloud-ess"));
         lifecyleTasks.put(DOCKER_WOLFI, project.getTasks().register(taskPrefix + ".docker-wolfi"));
         lifecyleTasks.put(ARCHIVE, project.getTasks().register(taskPrefix + ".archives"));

diff --git a/build-tools-internal/src/main/resources/checkstyle_suppressions.xml b/build-tools-internal/src/main/resources/checkstyle_suppressions.xml
@@ -35,6 +35,8 @@
   <!-- Intentionally have multi line string for a bulk request, otherwise this needs to fallback to string concatenation  -->
   <suppress files="modules[/\\]data-streams[/\\]src[/\\]javaRestTest[/\\]java[/\\]org[/\\]elasticsearch[/\\]datastreams[/\\]TsdbDataStreamRestIT.java" checks="LineLength" />
   <suppress files="qa[/\\]rolling-upgrade[/\\]src[/\\]javaRestTest[/\\]java[/\\]org[/\\]elasticsearch[/\\]upgrades[/\\]TsdbIT.java" checks="LineLength" />
+  <suppress files="qa[/\\]rolling-upgrade[/\\]src[/\\]javaRestTest[/\\]java[/\\]org[/\\]elasticsearch[/\\]upgrades[/\\]TsdbIndexingRollingUpgradeIT.java" checks="LineLength" />
+  <suppress files="qa[/\\]rolling-upgrade[/\\]src[/\\]javaRestTest[/\\]java[/\\]org[/\\]elasticsearch[/\\]upgrades[/\\]LogsdbIndexingRollingUpgradeIT.java" checks="LineLength" />
 
   <!-- Gradle requires inputs to be serializable -->
   <suppress files="build-tools-internal[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]gradle[/\\]internal[/\\]precommit[/\\]TestingConventionRule.java" checks="RegexpSinglelineJava" />

diff --git a/docs/changelog/113563.yaml b/docs/changelog/113563.yaml
diff --git a/docs/reference/cluster/voting-exclusions.asciidoc b/docs/reference/cluster/voting-exclusions.asciidoc
@@ -7,7 +7,6 @@
 Adds or removes master-eligible nodes from the
 <<modules-discovery-voting,voting configuration exclusion list>>.
 
-
 [[voting-config-exclusions-api-request]]
 ==== {api-request-title}
 
@@ -28,7 +27,7 @@ users can use this API.
 
 [[voting-config-exclusions-api-desc]]
 ==== {api-description-title}
-  
+
 By default, if there are more than three master-eligible nodes in the cluster
 and you remove fewer than half of the master-eligible nodes in the cluster at
 once, the <<modules-discovery-voting,voting configuration>> automatically
@@ -50,14 +49,19 @@ use `DELETE /_cluster/voting_config_exclusions?wait_for_removal=false` to clear
 the voting configuration exclusions without waiting for the nodes to leave the
 cluster.
 
-If the API fails, you can safely retry it. Only a successful response
-guarantees that the node has been removed from the voting configuration and
-will not be reinstated.
+A response to `POST /_cluster/voting_config_exclusions` with an HTTP status
+code of `200 OK` guarantees that the node has been removed from the voting
+configuration and will not be reinstated until the voting configuration
+exclusions are cleared by calling `DELETE /_cluster/voting_config_exclusions`.
+If the call to `POST /_cluster/voting_config_exclusions` fails or returns a
+response with an HTTP status code other than `200 OK` then the node may not
+have been removed from the voting configuration. In that case, you may safely
+retry the call.
 
 NOTE: Voting exclusions are required only when you remove at least half of the
 master-eligible nodes from a cluster in a short time period. They are not
-required when removing master-ineligible nodes or fewer than half of the
-master-eligible nodes.
+required when removing master-ineligible nodes or when removing fewer than half
+of the master-eligible nodes.
 
 For more information, see <<modules-discovery-removing-nodes>>.
 
@@ -94,15 +98,15 @@ list. Defaults to `true`, meaning that all excluded nodes must be removed from
 the cluster before this API takes any action. If set to `false` then the voting
 configuration exclusions list is cleared even if some excluded nodes are still
 in the cluster. Only applies to the `DELETE` form of this API.
-  
+
 [[voting-config-exclusions-api-example]]
 ==== {api-examples-title}
 
 Adds nodes named `nodeName1` and `nodeName2` to the voting configuration
 exclusions list:
 
 [source,console]
--------------------------------------------------- 
+--------------------------------------------------
 POST /_cluster/voting_config_exclusions?node_names=nodeName1,nodeName2
 --------------------------------------------------
 

diff --git a/docs/reference/connector/docs/connectors-API-tutorial.asciidoc b/docs/reference/connector/docs/connectors-API-tutorial.asciidoc
@@ -367,7 +367,7 @@ Refer to the individual connectors-references,connector references for these con
 ====
 We're using a self-managed connector in this tutorial.
 To use these APIs with an Elastic managed connector, there's some extra setup for API keys.
-Refer to native-connectors-manage-API-keys for details.
+Refer to <<es-native-connectors-manage-API-keys>> for details.
 ====
 
 We're now ready to sync our PostgreSQL data to {es}.

diff --git a/docs/reference/connector/docs/connectors-servicenow.asciidoc b/docs/reference/connector/docs/connectors-servicenow.asciidoc
@@ -81,15 +81,15 @@ Comma-separated list of services to fetch data from ServiceNow. If the value is
 - link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/incident-management/concept/c_IncidentManagement.html[Incident]
 - link:https://docs.servicenow.com/bundle/tokyo-servicenow-platform/page/use/service-catalog-requests/task/t_AddNewRequestItems.html[Requested Item]
 - link:https://docs.servicenow.com/bundle/tokyo-customer-service-management/page/product/customer-service-management/task/t_SearchTheKnowledgeBase.html[Knowledge]
-- link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/change-management/task/t_CreateAChange.html[Change Request]
+- link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/change-management/task/t_CreateAChange.html[Change request]
 +
 [NOTE]
 ====
 If you have configured a custom service, the `*` value will not fetch data from the basic services above by default. In this case you'll need to mention these service names explicitly.
 ====
 Default value is `*`. Examples:
 +
-  - `User, Incident, Requested Item, Knowledge, Change Request`
+  - `User, Incident, Requested Item, Knowledge, Change request`
   - `*`
 
 Enable document level security::
@@ -139,7 +139,7 @@ For default services, connectors use the following roles to find users who have
 
 | Knowledge | `admin`, `knowledge`, `knowledge_manager`, `knowledge_admin` 
 
-| Change Request | `admin`, `sn_change_read`, `itil` 
+| Change request | `admin`, `sn_change_read`, `itil` 
 |===
 
 For services other than these defaults, the connector iterates over access controls with `read` operations and finds the respective roles for those services.
@@ -305,15 +305,15 @@ Comma-separated list of services to fetch data from ServiceNow. If the value is
 - link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/incident-management/concept/c_IncidentManagement.html[Incident]
 - link:https://docs.servicenow.com/bundle/tokyo-servicenow-platform/page/use/service-catalog-requests/task/t_AddNewRequestItems.html[Requested Item]
 - link:https://docs.servicenow.com/bundle/tokyo-customer-service-management/page/product/customer-service-management/task/t_SearchTheKnowledgeBase.html[Knowledge]
-- link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/change-management/task/t_CreateAChange.html[Change Request]
+- link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/change-management/task/t_CreateAChange.html[Change request]
 +
 [NOTE]
 ====
 If you have configured a custom service, the `*` value will not fetch data from the basic services above by default. In this case you'll need to mention these service names explicitly.
 ====
 Default value is `*`. Examples:
 +
-  - `User, Incident, Requested Item, Knowledge, Change Request`
+  - `User, Incident, Requested Item, Knowledge, Change request`
   - `*`
 
 `retry_count`::
@@ -374,7 +374,7 @@ For default services, connectors use the following roles to find users who have
 
 | Knowledge | `admin`, `knowledge`, `knowledge_manager`, `knowledge_admin` 
 
-| Change Request | `admin`, `sn_change_read`, `itil` 
+| Change request | `admin`, `sn_change_read`, `itil` 
 |===
 
 For services other than these defaults, the connector iterates over access controls with `read` operations and finds the respective roles for those services.

diff --git a/docs/reference/inference/inference-apis.asciidoc b/docs/reference/inference/inference-apis.asciidoc
@@ -35,7 +35,6 @@ Elastic –, then create an {infer} endpoint by the <<put-inference-api>>.
 Now use <<semantic-search-semantic-text, semantic text>> to perform
 <<semantic-search, semantic search>> on your data.
 
-
 [discrete]
 [[default-enpoints]]
 === Default {infer} endpoints
@@ -53,6 +52,67 @@ For these models, the minimum number of allocations is `0`.
 If there is no {infer} activity that uses the endpoint, the number of allocations will scale down to `0` automatically after 15 minutes.
 
 
+[discrete]
+[[infer-chunking-config]]
+=== Configuring chunking
+
+{infer-cap} endpoints have a limit on the amount of text they can process at once, determined by the model's input capacity.
+Chunking is the process of splitting the input text into pieces that remain within these limits.
+It occurs when ingesting documents into <<semantic-text,`semantic_text` fields>>.
+Chunking also helps produce sections that are digestible for humans.
+Returning a long document in search results is less useful than providing the most relevant chunk of text.
+
+Each chunk will include the text subpassage and the corresponding embedding generated from it.
+
+By default, documents are split into sentences and grouped in sections up to 250 words with 1 sentence overlap so that each chunk shares a sentence with the previous chunk.
+Overlapping ensures continuity and prevents vital contextual information in the input text from being lost by a hard break. 
+
+{es} uses the https://unicode-org.github.io/icu-docs/[ICU4J] library to detect word and sentence boundaries for chunking.
+https://unicode-org.github.io/icu/userguide/boundaryanalysis/#word-boundary[Word boundaries] are identified by following a series of rules, not just the presence of a whitespace character.
+For written languages that do not use whitespace, such as Chinese or Japanese, dictionary lookups are used to detect word boundaries.
+
+
+[discrete]
+==== Chunking strategies
+
+Two strategies are available for chunking: `sentence` and `word`.
+
+The `sentence` strategy splits the input text at sentence boundaries.
+Each chunk contains one or more complete sentences ensuring that the integrity of sentence-level context is preserved, except when a sentence causes a chunk to exceed a word count of `max_chunk_size`, in which case it will be split across chunks.
+The `sentence_overlap` option defines the number of sentences from the previous chunk to include in the current chunk which is either `0` or `1`.
+
+The `word` strategy splits the input text on individual words up to the `max_chunk_size` limit.
+The `overlap` option is the number of words from the previous chunk to include in the current chunk.
+
+The default chunking strategy is `sentence`.
+
+NOTE: The default chunking strategy for {infer} endpoints created before 8.16 is `word`.
+
+
+[discrete]
+==== Example of configuring the chunking behavior
+
+The following example creates an {infer} endpoint with the `elasticsearch` service that deploys the ELSER model by default and configures the chunking behavior.
+
+[source,console]
+------------------------------------------------------------
+PUT _inference/sparse_embedding/small_chunk_size
+{
+  "service": "elasticsearch",
+  "service_settings": {
+    "num_allocations": 1,
+    "num_threads": 1
+  },
+  "chunking_settings": {
+    "strategy": "sentence",
+    "max_chunk_size": 100,
+    "sentence_overlap": 0
+  }
+}
+------------------------------------------------------------
+// TEST[skip:TBD]
+
+
 include::delete-inference.asciidoc[]
 include::get-inference.asciidoc[]
 include::post-inference.asciidoc[]

diff --git a/docs/reference/inference/inference-shared.asciidoc b/docs/reference/inference/inference-shared.asciidoc
@@ -31,4 +31,36 @@ end::task-settings[]
 
 tag::task-type[]
 The type of the {infer} task that the model will perform.
-end::task-type[]
+end::task-type[]
+
+tag::chunking-settings[]
+Chunking configuration object.
+Refer to <<infer-chunking-config>> to learn more about chunking.
+end::chunking-settings[]
+
+tag::chunking-settings-max-chunking-size[]
+Specifies the maximum size of a chunk in words.
+Defaults to `250`.
+This value cannot be higher than `300` or lower than `20` (for `sentence` strategy) or `10` (for `word` strategy). 
+end::chunking-settings-max-chunking-size[]
+
+tag::chunking-settings-overlap[]
+Only for `word` chunking strategy.
+Specifies the number of overlapping words for chunks.
+Defaults to `100`.
+This value cannot be higher than half of `max_chunk_size`.
+end::chunking-settings-overlap[]
+
+tag::chunking-settings-sentence-overlap[]
+Only for `sentence` chunking strategy.
+Specifies the number of overlapping sentences for chunks.
+It can be either `1` or `0`.
+Defaults to `1`.
+end::chunking-settings-sentence-overlap[]
+
+tag::chunking-settings-strategy[]
+Specifies the chunking strategy.
+It can be either `sentence` or `word`.
+end::chunking-settings-strategy[]
+
+