Merge remote-tracking branch 'origin/main' into carlosdelest/semantic…

…-text-new-ingestion-inference
carlosdelest · May 14, 2024 · ac31f7b · ac31f7b
2 parents 199d4ba + 4be03e9
commit ac31f7b
Show file tree

Hide file tree

Showing 409 changed files with 5,484 additions and 2,694 deletions.
diff --git a/.buildkite/pipelines/periodic.template.yml b/.buildkite/pipelines/periodic.template.yml
@@ -88,6 +88,7 @@ steps:
               - openjdk17
               - openjdk21
               - openjdk22
+              - openjdk23
             GRADLE_TASK:
               - checkPart1
               - checkPart2
@@ -113,6 +114,7 @@ steps:
               - openjdk17
               - openjdk21
               - openjdk22
+              - openjdk23
             BWC_VERSION: $BWC_LIST
         agents:
           provider: gcp

diff --git a/.buildkite/pipelines/periodic.yml b/.buildkite/pipelines/periodic.yml
@@ -735,6 +735,7 @@ steps:
               - openjdk17
               - openjdk21
               - openjdk22
+              - openjdk23
             GRADLE_TASK:
               - checkPart1
               - checkPart2
@@ -760,6 +761,7 @@ steps:
               - openjdk17
               - openjdk21
               - openjdk22
+              - openjdk23
             BWC_VERSION: ["7.17.22", "8.13.5", "8.14.0", "8.15.0"]
         agents:
           provider: gcp

diff --git a/distribution/packages/src/deb/lintian/elasticsearch b/distribution/packages/src/deb/lintian/elasticsearch
@@ -59,3 +59,7 @@ unknown-field License
 # don't build them ourselves and the license precludes us modifying them
 # to fix this.
 library-not-linked-against-libc usr/share/elasticsearch/modules/x-pack-ml/platform/linux-x86_64/lib/libmkl_*.so
+
+# shared-lib-without-dependency-information (now shared-library-lacks-prerequisites) is falsely reported for libvec.so
+# which has no dependencies (not even libc) besides the symbols in the base executable.
+shared-lib-without-dependency-information usr/share/elasticsearch/lib/platform/linux-x64/libvec.so
diff --git a/...ibution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/SystemJvmOptions.java b/...ibution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/SystemJvmOptions.java
@@ -77,6 +77,7 @@ static List<String> systemJvmOptions(Settings nodeSettings, final Map<String, St
             maybeEnableNativeAccess(),
             maybeOverrideDockerCgroup(distroType),
             maybeSetActiveProcessorCount(nodeSettings),
+            maybeWorkaroundG1Bug(),
             setReplayFile(distroType, isHotspot),
             "-Djava.library.path=" + libraryPath,
             "-Djna.library.path=" + libraryPath,
@@ -137,6 +138,17 @@ private static String maybeEnableNativeAccess() {
         return "";
     }
 
+    /*
+     * Returns the JVM flags that work around a G1 bug, or an empty string when the running JVM is not affected.
+     * Only affects 22 and 22.0.1, see https://bugs.openjdk.org/browse/JDK-8329528
+     * The check matches feature version 22 with an update version of at most 1.
+     * NOTE(review): both flags are returned in a single space-separated string; confirm that the caller
+     * splits it into separate arguments before they are handed to the JVM.
+     */
+    private static String maybeWorkaroundG1Bug() {
+        Runtime.Version v = Runtime.version();
+        if (v.feature() == 22 && v.update() <= 1) {
+            return "-XX:+UnlockDiagnosticVMOptions -XX:G1NumCollectionsKeepPinned=10000000";
+        }
+        return "";
+    }
+
     private static String findLibraryPath(Map<String, String> sysprops) {
         // working dir is ES installation, so we use relative path here
         Path platformDir = Paths.get("lib", "platform");

diff --git a/docs/build.gradle b/docs/build.gradle
@@ -1752,6 +1752,7 @@ setups['setup-snapshots'] = setups['setup-repository'] + '''
         name: "my_admin_role"
         body: >
           {
+            "description": "Grants full access to all management features within the cluster.",
             "cluster": ["all"],
             "indices": [
               {"names": ["index1", "index2" ], "privileges": ["all"], "field_security" : {"grant" : [ "title", "body" ]}}

diff --git a/docs/changelog/106486.yaml b/docs/changelog/106486.yaml
@@ -0,0 +1,17 @@
+pr: 106486
+summary: Create custom parser for ISO-8601 datetimes
+area: Infra/Core
+type: enhancement
+issues:
+  - 102063
+highlight:
+  title: New custom parser for ISO-8601 datetimes
+  body: |-
+    This introduces a new custom parser for ISO-8601 datetimes, for the `iso8601`, `strict_date_optional_time`, and
+    `strict_date_optional_time_nanos` built-in date formats. This provides a performance improvement over the
+    default Java date-time parsing. Whilst it maintains much of the same behaviour,
+    the new parser does not accept nonsensical date-time strings that have multiple fractional seconds fields
+    or multiple timezone specifiers. If the new parser fails to parse a string, it will then use the previous parser
+    to parse it. If a large proportion of the input data consists of these invalid strings, this may cause
+    a small performance degradation. If you wish to force the use of the old parsers regardless,
+    set the JVM property `es.datetime.java_time_parsers=true` on all ES nodes.
diff --git a/docs/changelog/107886.yaml b/docs/changelog/107886.yaml
@@ -0,0 +1,5 @@
+pr: 107886
+summary: Cluster state role mapper file settings service
+area: Authorization
+type: enhancement
+issues: []
diff --git a/docs/changelog/108452.yaml b/docs/changelog/108452.yaml
@@ -0,0 +1,5 @@
+pr: 108452
+summary: Add the rerank task to the Elasticsearch internal inference service
+area: Machine Learning
+type: enhancement
+issues: []
diff --git a/docs/changelog/108517.yaml b/docs/changelog/108517.yaml
@@ -0,0 +1,6 @@
+pr: 108517
+summary: Forward `indexServiceSafe` exception to listener
+area: Transform
+type: bug
+issues:
+ - 108418
diff --git a/docs/changelog/108518.yaml b/docs/changelog/108518.yaml
@@ -0,0 +1,5 @@
+pr: 108518
+summary: Remove leading is_ prefix from Enterprise geoip docs
+area: Ingest Node
+type: bug
+issues: []
diff --git a/docs/changelog/108521.yaml b/docs/changelog/108521.yaml
@@ -0,0 +1,6 @@
+pr: 108521
+summary: Adding override for lintian false positive on `libvec.so`
+area: "Packaging"
+type: bug
+issues:
+ - 108514
diff --git a/docs/changelog/108522.yaml b/docs/changelog/108522.yaml
@@ -0,0 +1,5 @@
+pr: 108522
+summary: Ensure we return non-negative scores when scoring scalar dot-products
+area: Vector Search
+type: bug
+issues: []
diff --git a/docs/changelog/108562.yaml b/docs/changelog/108562.yaml
@@ -0,0 +1,6 @@
+pr: 108562
+summary: Add `internalClusterTest` for and fix leak in `ExpandSearchPhase`
+area: Search
+type: bug
+issues:
+ - 108369
diff --git a/docs/changelog/108571.yaml b/docs/changelog/108571.yaml
@@ -0,0 +1,5 @@
+pr: 108571
+summary: Workaround G1 bug for JDK 22 and 22.0.1
+area: Infra/CLI
+type: bug
+issues: []
diff --git a/docs/internal/DistributedArchitectureGuide.md b/docs/internal/DistributedArchitectureGuide.md
@@ -1,6 +1,14 @@
-# Distributed Area Team Internals
+# Distributed Area Internals
 
-(Summary, brief discussion of our features)
+The Distributed Area contains indexing and coordination systems.
+
+The index path stretches from the user REST command through shard routing down to each individual shard's translog and storage
+engine. Reindexing is effectively reading from a source index and writing to a destination index (perhaps on different nodes).
+The coordination side includes cluster coordination, shard allocation, cluster autoscaling stats, task management, and cross
+cluster replication. Less obvious coordination systems include networking, the discovery plugin system, the snapshot/restore
+logic, and shard recovery.
+
+A guide to the general Elasticsearch components can be found [here](https://github.com/elastic/elasticsearch/blob/main/docs/internal/GeneralArchitectureGuide.md).
 
 # Networking
 
@@ -237,9 +245,101 @@ works in parallel with the storage engine.)
 
 # Autoscaling
 
-(Reactive and proactive autoscaling. Explain that we surface recommendations, how control plane uses it.)
-
-(Sketch / list the different deciders that we have, and then also how we use information from each to make a recommendation.)
+The Autoscaling API in ES (Elasticsearch) uses cluster and node level statistics to provide a recommendation
+for a cluster size to support the current cluster data and active workloads. ES Autoscaling is paired
+with an ES Cloud service that periodically polls the ES elected master node for suggested cluster
+changes. The cloud service will add more resources to the cluster based on Elasticsearch's recommendation.
+Elasticsearch by itself cannot automatically scale.
+
+Autoscaling recommendations are tailored for the user [based on user defined policies][], composed of data
+roles (hot, frozen, etc) and [deciders][]. There's a public [webinar on autoscaling][], as well as the
+public [Autoscaling APIs] docs.
+
+Autoscaling's current implementation is based primarily on storage requirements, as well as memory capacity
+for ML and frozen tier. It does not yet support scaling related to search load. Paired with ES Cloud,
+autoscaling only scales upward, not downward, except for ML nodes that do get scaled up _and_ down.
+
+[based on user defined policies]: https://www.elastic.co/guide/en/elasticsearch/reference/current/xpack-autoscaling.html
+[deciders]: https://www.elastic.co/guide/en/elasticsearch/reference/current/autoscaling-deciders.html
+[webinar on autoscaling]: https://www.elastic.co/webinars/autoscaling-from-zero-to-production-seamlessly
+[Autoscaling APIs]: https://www.elastic.co/guide/en/elasticsearch/reference/current/autoscaling-apis.html
+
+### Plugin REST and TransportAction entrypoints
+
+Autoscaling is a [plugin][]. All the REST APIs can be found in [autoscaling/rest/][].
+`GetAutoscalingCapacityAction` is the capacity calculation operation REST endpoint, as opposed to the
+other rest commands that get/set/delete the policies guiding the capacity calculation. The Transport
+Actions can be found in [autoscaling/action/], where [TransportGetAutoscalingCapacityAction][] is the
+entrypoint on the master node for calculating the optimal cluster resources based on the autoscaling
+policies.
+
+[plugin]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/Autoscaling.java#L72
+[autoscaling/rest/]: https://github.com/elastic/elasticsearch/tree/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/rest
+[autoscaling/action/]: https://github.com/elastic/elasticsearch/tree/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/action
+[TransportGetAutoscalingCapacityAction]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/action/TransportGetAutoscalingCapacityAction.java#L82-L98
+
+### How cluster capacity is determined
+
+[AutoscalingMetadata][] implements [Metadata.Custom][] in order to persist autoscaling policies. Each
+Decider is an implementation of [AutoscalingDeciderService][]. The [AutoscalingCalculateCapacityService][]
+is responsible for running the calculation.
+
+[TransportGetAutoscalingCapacityAction.computeCapacity] is the entry point to [AutoscalingCalculateCapacityService.calculate],
+which creates an [AutoscalingDeciderResults][] for [each autoscaling policy][]. [AutoscalingDeciderResults.toXContent][] then
+determines the [maximum required capacity][] to return to the caller. [AutoscalingCapacity][] is the base unit of a cluster
+resources recommendation.
+
+The `TransportGetAutoscalingCapacityAction` response is cached to prevent concurrent callers
+overloading the system: the operation is expensive. `TransportGetAutoscalingCapacityAction` contains
+a [CapacityResponseCache][]. `TransportGetAutoscalingCapacityAction.masterOperation`
+calls [through the CapacityResponseCache][], into the `AutoscalingCalculateCapacityService`, to handle
+concurrent callers.
+
+[AutoscalingMetadata]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/AutoscalingMetadata.java#L38
+[Metadata.Custom]: https://github.com/elastic/elasticsearch/blob/v8.13.2/server/src/main/java/org/elasticsearch/cluster/metadata/Metadata.java#L141-L145
+[AutoscalingDeciderService]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/capacity/AutoscalingDeciderService.java#L16-L19
+[AutoscalingCalculateCapacityService]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/capacity/AutoscalingCalculateCapacityService.java#L43
+
+[TransportGetAutoscalingCapacityAction.computeCapacity]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/action/TransportGetAutoscalingCapacityAction.java#L102-L108
+[AutoscalingCalculateCapacityService.calculate]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/capacity/AutoscalingCalculateCapacityService.java#L108-L139
+[AutoscalingDeciderResults]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/capacity/AutoscalingDeciderResults.java#L34-L38
+[each autoscaling policy]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/capacity/AutoscalingCalculateCapacityService.java#L124-L131
+[AutoscalingDeciderResults.toXContent]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/capacity/AutoscalingDeciderResults.java#L78
+[maximum required capacity]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/capacity/AutoscalingDeciderResults.java#L105-L116
+[AutoscalingCapacity]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/capacity/AutoscalingCapacity.java#L27-L35
+
+[CapacityResponseCache]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/action/TransportGetAutoscalingCapacityAction.java#L44-L47
+[through the CapacityResponseCache]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/action/TransportGetAutoscalingCapacityAction.java#L97
+
+### Where the data comes from
+
+The Deciders each pull data from different sources as needed to inform their decisions. The
+[DiskThresholdMonitor][] is one such data source. The Monitor runs on the master node and maintains
+lists of nodes that exceed various disk size thresholds. [DiskThresholdSettings][] contains the
+threshold settings with which the `DiskThresholdMonitor` runs.
+
+[DiskThresholdMonitor]: https://github.com/elastic/elasticsearch/blob/v8.13.2/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitor.java#L53-L58
+[DiskThresholdSettings]: https://github.com/elastic/elasticsearch/blob/v8.13.2/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettings.java#L24-L27
+
+### Deciders
+
+The `ReactiveStorageDeciderService` tracks information that demonstrates storage limitations are causing
+problems in the cluster. It uses [an algorithm defined here][]. Some examples of the information it tracks are:
+- information from the `DiskThresholdMonitor` to find out whether nodes are exceeding their storage capacity
+- number of unassigned shards that failed allocation because of insufficient storage
+- the max shard size and minimum node size, and whether these can be satisfied with the existing infrastructure
+
+[an algorithm defined here]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ReactiveStorageDeciderService.java#L158-L176
+
+The `ProactiveStorageDeciderService` maintains a forecast window that [defaults to 30 minutes][]. It only
+runs on data streams (ILM, rollover, etc), not regular indexes. It looks at past [index changes][] that
+took place within the forecast window to [predict][] resources that will be needed shortly.
+
+[defaults to 30 minutes]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ProactiveStorageDeciderService.java#L32
+[index changes]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ProactiveStorageDeciderService.java#L79-L83
+[predict]: https://github.com/elastic/elasticsearch/blob/v8.13.2/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ProactiveStorageDeciderService.java#L85-L95
+
+There are several more Decider Services, implementing the `AutoscalingDeciderService` interface.
 
 # Snapshot / Restore
 

diff --git a/docs/reference/connector/apis/list-connector-sync-jobs-api.asciidoc b/docs/reference/connector/apis/list-connector-sync-jobs-api.asciidoc
@@ -30,13 +30,13 @@ Returns information about all stored connector sync jobs ordered by their creati
 (Optional, integer) The offset from the first result to fetch. Defaults to `0`.
 
 `status`::
-(Optional, job status) The job status the fetched sync jobs need to have.
+(Optional, job status) A comma-separated list of job statuses to filter the results. Available statuses include: `canceling`, `canceled`, `completed`, `error`, `in_progress`, `pending`, `suspended`.
 
 `connector_id`::
 (Optional, string) The connector id the fetched sync jobs need to have.
 
 `job_type`::
-(Optional, job type) A comma-separated list of job types.
+(Optional, job type) A comma-separated list of job types. Available job types are: `full`, `incremental` and `access_control`.
 
 [[list-connector-sync-jobs-api-example]]
 ==== {api-examples-title}

diff --git a/docs/reference/data-streams/change-mappings-and-settings.asciidoc b/docs/reference/data-streams/change-mappings-and-settings.asciidoc
@@ -602,7 +602,7 @@ stream's oldest backing index.
 // TESTRESPONSE[s/"index_uuid": "_eEfRrFHS9OyhqWntkgHAQ"/"index_uuid": $body.data_streams.0.indices.1.index_uuid/]
 // TESTRESPONSE[s/"index_name": ".ds-my-data-stream-2099.03.07-000001"/"index_name": $body.data_streams.0.indices.0.index_name/]
 // TESTRESPONSE[s/"index_name": ".ds-my-data-stream-2099.03.08-000002"/"index_name": $body.data_streams.0.indices.1.index_name/]
-// TESTRESPONSE[s/"status": "GREEN"/"status": "YELLOW","failure_indices":[],"failure_store":false/]
+// TESTRESPONSE[s/"status": "GREEN"/"status": "YELLOW","failure_store":{"enabled": false, "indices": [], "rollover_on_write": false}/]
 
 <1> First item in the `indices` array for `my-data-stream`. This item contains
 information about the stream's oldest backing index,

diff --git a/docs/reference/data-streams/downsampling-manual.asciidoc b/docs/reference/data-streams/downsampling-manual.asciidoc
@@ -389,7 +389,7 @@ This returns:
 // TESTRESPONSE[s/"ltOJGmqgTVm4T-Buoe7Acg"/$body.data_streams.0.indices.0.index_uuid/]
 // TESTRESPONSE[s/"2023-07-26T09:26:42.000Z"/$body.data_streams.0.time_series.temporal_ranges.0.start/]
 // TESTRESPONSE[s/"2023-07-26T13:26:42.000Z"/$body.data_streams.0.time_series.temporal_ranges.0.end/]
-// TESTRESPONSE[s/"replicated": false/"replicated": false,"failure_indices":[],"failure_store":false/]
+// TESTRESPONSE[s/"replicated": false/"replicated": false,"failure_store":{"enabled": false, "indices": [], "rollover_on_write": false}/]
 <1> The backing index for this data stream.
 
 Before a backing index can be downsampled, the TSDS needs to be rolled over and