Skip to content

Commit

Permalink
Merge branch 'main' of github.com:elastic/elasticsearch into batch-th…
Browse files Browse the repository at this point in the history
…e-chunks
  • Loading branch information
jan-elastic committed Oct 28, 2024
2 parents 855790b + 98cd34f commit 19deea7
Show file tree
Hide file tree
Showing 97 changed files with 4,062 additions and 1,663 deletions.
35 changes: 34 additions & 1 deletion build-tools-internal/src/main/groovy/elasticsearch.ide.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,36 @@ if (providers.systemProperty('idea.active').getOrNull() == 'true') {
.findAll { it != null }
}

// force IntelliJ to generate *.iml files for each imported module
tasks.register("enableExternalConfiguration") {
group = 'ide'
description = 'Enable per-module *.iml files'

doLast {
modifyXml('.idea/misc.xml') {xml ->
def externalStorageConfig = xml.component.find { it.'@name' == 'ExternalStorageConfigurationManager' }
if (externalStorageConfig) {
xml.remove(externalStorageConfig)
}
}
}
}

// modifies the idea module config to enable preview features on 'elasticsearch-native' module
tasks.register("enablePreviewFeatures") {
group = 'ide'
description = 'Enables preview features on native library module'
dependsOn tasks.named("enableExternalConfiguration")

doLast {
['main', 'test'].each { sourceSet ->
modifyXml(".idea/modules/libs/native/elasticsearch.libs.elasticsearch-native.${sourceSet}.iml") { xml ->
xml.component.find { it.'@name' == 'NewModuleRootManager' }?.'@LANGUAGE_LEVEL' = 'JDK_21_PREVIEW'
}
}
}
}

tasks.register('buildDependencyArtifacts') {
group = 'ide'
description = 'Builds artifacts needed as dependency for IDE modules'
Expand Down Expand Up @@ -149,7 +179,10 @@ if (providers.systemProperty('idea.active').getOrNull() == 'true') {
testRunner = 'choose_per_test'
}
taskTriggers {
afterSync tasks.named('configureIdeCheckstyle'), tasks.named('configureIdeaGradleJvm'), tasks.named('buildDependencyArtifacts')
afterSync tasks.named('configureIdeCheckstyle'),
tasks.named('configureIdeaGradleJvm'),
tasks.named('buildDependencyArtifacts'),
tasks.named('enablePreviewFeatures')
}
encodings {
encoding = 'UTF-8'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,6 @@ private static String distributionProjectName(ElasticsearchDistribution distribu
if (distribution.getType() == InternalElasticsearchDistributionTypes.DOCKER_IRONBANK) {
return projectName + "ironbank-docker" + archString + "-export";
}
if (distribution.getType() == InternalElasticsearchDistributionTypes.DOCKER_CLOUD) {
return projectName + "cloud-docker" + archString + "-export";
}
if (distribution.getType() == InternalElasticsearchDistributionTypes.DOCKER_CLOUD_ESS) {
return projectName + "cloud-ess-docker" + archString + "-export";
}
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ public class InternalElasticsearchDistributionTypes {
public static ElasticsearchDistributionType DOCKER = new DockerElasticsearchDistributionType();
public static ElasticsearchDistributionType DOCKER_UBI = new DockerUbiElasticsearchDistributionType();
public static ElasticsearchDistributionType DOCKER_IRONBANK = new DockerIronBankElasticsearchDistributionType();
public static ElasticsearchDistributionType DOCKER_CLOUD = new DockerCloudElasticsearchDistributionType();
public static ElasticsearchDistributionType DOCKER_CLOUD_ESS = new DockerCloudEssElasticsearchDistributionType();
public static ElasticsearchDistributionType DOCKER_WOLFI = new DockerWolfiElasticsearchDistributionType();

Expand All @@ -29,7 +28,6 @@ public class InternalElasticsearchDistributionTypes {
DOCKER,
DOCKER_UBI,
DOCKER_IRONBANK,
DOCKER_CLOUD,
DOCKER_CLOUD_ESS,
DOCKER_WOLFI
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.ALL_INTERNAL;
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DEB;
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER;
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_CLOUD;
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_CLOUD_ESS;
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_IRONBANK;
import static org.elasticsearch.gradle.internal.distribution.InternalElasticsearchDistributionTypes.DOCKER_UBI;
Expand Down Expand Up @@ -149,7 +148,6 @@ private static Map<ElasticsearchDistributionType, TaskProvider<?>> lifecycleTask
lifecyleTasks.put(DOCKER, project.getTasks().register(taskPrefix + ".docker"));
lifecyleTasks.put(DOCKER_UBI, project.getTasks().register(taskPrefix + ".docker-ubi"));
lifecyleTasks.put(DOCKER_IRONBANK, project.getTasks().register(taskPrefix + ".docker-ironbank"));
lifecyleTasks.put(DOCKER_CLOUD, project.getTasks().register(taskPrefix + ".docker-cloud"));
lifecyleTasks.put(DOCKER_CLOUD_ESS, project.getTasks().register(taskPrefix + ".docker-cloud-ess"));
lifecyleTasks.put(DOCKER_WOLFI, project.getTasks().register(taskPrefix + ".docker-wolfi"));
lifecyleTasks.put(ARCHIVE, project.getTasks().register(taskPrefix + ".archives"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
<!-- Intentionally have multi line string for a bulk request, otherwise this needs to fallback to string concatenation -->
<suppress files="modules[/\\]data-streams[/\\]src[/\\]javaRestTest[/\\]java[/\\]org[/\\]elasticsearch[/\\]datastreams[/\\]TsdbDataStreamRestIT.java" checks="LineLength" />
<suppress files="qa[/\\]rolling-upgrade[/\\]src[/\\]javaRestTest[/\\]java[/\\]org[/\\]elasticsearch[/\\]upgrades[/\\]TsdbIT.java" checks="LineLength" />
<suppress files="qa[/\\]rolling-upgrade[/\\]src[/\\]javaRestTest[/\\]java[/\\]org[/\\]elasticsearch[/\\]upgrades[/\\]TsdbIndexingRollingUpgradeIT.java" checks="LineLength" />
<suppress files="qa[/\\]rolling-upgrade[/\\]src[/\\]javaRestTest[/\\]java[/\\]org[/\\]elasticsearch[/\\]upgrades[/\\]LogsdbIndexingRollingUpgradeIT.java" checks="LineLength" />

<!-- Gradle requires inputs to be seriablizable -->
<suppress files="build-tools-internal[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]gradle[/\\]internal[/\\]precommit[/\\]TestingConventionRule.java" checks="RegexpSinglelineJava" />
Expand Down
5 changes: 0 additions & 5 deletions docs/changelog/113563.yaml

This file was deleted.

22 changes: 13 additions & 9 deletions docs/reference/cluster/voting-exclusions.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
Adds or removes master-eligible nodes from the
<<modules-discovery-voting,voting configuration exclusion list>>.


[[voting-config-exclusions-api-request]]
==== {api-request-title}

Expand All @@ -28,7 +27,7 @@ users can use this API.

[[voting-config-exclusions-api-desc]]
==== {api-description-title}

By default, if there are more than three master-eligible nodes in the cluster
and you remove fewer than half of the master-eligible nodes in the cluster at
once, the <<modules-discovery-voting,voting configuration>> automatically
Expand All @@ -50,14 +49,19 @@ use `DELETE /_cluster/voting_config_exclusions?wait_for_removal=false` to clear
the voting configuration exclusions without waiting for the nodes to leave the
cluster.

If the API fails, you can safely retry it. Only a successful response
guarantees that the node has been removed from the voting configuration and
will not be reinstated.
A response to `POST /_cluster/voting_config_exclusions` with an HTTP status
code of `200 OK` guarantees that the node has been removed from the voting
configuration and will not be reinstated until the voting configuration
exclusions are cleared by calling `DELETE /_cluster/voting_config_exclusions`.
If the call to `POST /_cluster/voting_config_exclusions` fails or returns a
response with an HTTP status code other than `200 OK` then the node may not
have been removed from the voting configuration. In that case, you may safely
retry the call.

NOTE: Voting exclusions are required only when you remove at least half of the
master-eligible nodes from a cluster in a short time period. They are not
required when removing master-ineligible nodes or fewer than half of the
master-eligible nodes.
required when removing master-ineligible nodes or when removing fewer than half
of the master-eligible nodes.

For more information, see <<modules-discovery-removing-nodes>>.

Expand Down Expand Up @@ -94,15 +98,15 @@ list. Defaults to `true`, meaning that all excluded nodes must be removed from
the cluster before this API takes any action. If set to `false` then the voting
configuration exclusions list is cleared even if some excluded nodes are still
in the cluster. Only applies to the `DELETE` form of this API.

[[voting-config-exclusions-api-example]]
==== {api-examples-title}

Adds nodes named `nodeName1` and `nodeName2` to the voting configuration
exclusions list:

[source,console]
--------------------------------------------------
--------------------------------------------------
POST /_cluster/voting_config_exclusions?node_names=nodeName1,nodeName2
--------------------------------------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ Refer to the individual connectors-references,connector references for these con
====
We're using a self-managed connector in this tutorial.
To use these APIs with an Elastic managed connector, there's some extra setup for API keys.
Refer to native-connectors-manage-API-keys for details.
Refer to <<es-native-connectors-manage-API-keys>> for details.
====

We're now ready to sync our PostgreSQL data to {es}.
Expand Down
12 changes: 6 additions & 6 deletions docs/reference/connector/docs/connectors-servicenow.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,15 @@ Comma-separated list of services to fetch data from ServiceNow. If the value is
- link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/incident-management/concept/c_IncidentManagement.html[Incident]
- link:https://docs.servicenow.com/bundle/tokyo-servicenow-platform/page/use/service-catalog-requests/task/t_AddNewRequestItems.html[Requested Item]
- link:https://docs.servicenow.com/bundle/tokyo-customer-service-management/page/product/customer-service-management/task/t_SearchTheKnowledgeBase.html[Knowledge]
- link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/change-management/task/t_CreateAChange.html[Change Request]
- link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/change-management/task/t_CreateAChange.html[Change request]
+
[NOTE]
====
If you have configured a custom service, the `*` value will not fetch data from the basic services above by default. In this case you'll need to mention these service names explicitly.
====
Default value is `*`. Examples:
+
- `User, Incident, Requested Item, Knowledge, Change Request`
- `User, Incident, Requested Item, Knowledge, Change request`
- `*`
Enable document level security::
Expand Down Expand Up @@ -139,7 +139,7 @@ For default services, connectors use the following roles to find users who have
| Knowledge | `admin`, `knowledge`, `knowledge_manager`, `knowledge_admin`
| Change Request | `admin`, `sn_change_read`, `itil`
| Change request | `admin`, `sn_change_read`, `itil`
|===
For services other than these defaults, the connector iterates over access controls with `read` operations and finds the respective roles for those services.
Expand Down Expand Up @@ -305,15 +305,15 @@ Comma-separated list of services to fetch data from ServiceNow. If the value is
- link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/incident-management/concept/c_IncidentManagement.html[Incident]
- link:https://docs.servicenow.com/bundle/tokyo-servicenow-platform/page/use/service-catalog-requests/task/t_AddNewRequestItems.html[Requested Item]
- link:https://docs.servicenow.com/bundle/tokyo-customer-service-management/page/product/customer-service-management/task/t_SearchTheKnowledgeBase.html[Knowledge]
- link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/change-management/task/t_CreateAChange.html[Change Request]
- link:https://docs.servicenow.com/bundle/tokyo-it-service-management/page/product/change-management/task/t_CreateAChange.html[Change request]
+
[NOTE]
====
If you have configured a custom service, the `*` value will not fetch data from the basic services above by default. In this case you'll need to mention these service names explicitly.
====
Default value is `*`. Examples:
+
- `User, Incident, Requested Item, Knowledge, Change Request`
- `User, Incident, Requested Item, Knowledge, Change request`
- `*`
`retry_count`::
Expand Down Expand Up @@ -374,7 +374,7 @@ For default services, connectors use the following roles to find users who have
| Knowledge | `admin`, `knowledge`, `knowledge_manager`, `knowledge_admin`
| Change Request | `admin`, `sn_change_read`, `itil`
| Change request | `admin`, `sn_change_read`, `itil`
|===
For services other than these defaults, the connector iterates over access controls with `read` operations and finds the respective roles for those services.
Expand Down
62 changes: 61 additions & 1 deletion docs/reference/inference/inference-apis.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ Elastic –, then create an {infer} endpoint by the <<put-inference-api>>.
Now use <<semantic-search-semantic-text, semantic text>> to perform
<<semantic-search, semantic search>> on your data.


[discrete]
[[default-enpoints]]
=== Default {infer} endpoints
Expand All @@ -53,6 +52,67 @@ For these models, the minimum number of allocations is `0`.
If there is no {infer} activity that uses the endpoint, the number of allocations will scale down to `0` automatically after 15 minutes.


[discrete]
[[infer-chunking-config]]
=== Configuring chunking

{infer-cap} endpoints have a limit on the amount of text they can process at once, determined by the model's input capacity.
Chunking is the process of splitting the input text into pieces that remain within these limits.
It occurs when ingesting documents into <<semantic-text,`semantic_text` fields>>.
Chunking also helps produce sections that are digestible for humans.
Returning a long document in search results is less useful than providing the most relevant chunk of text.

Each chunk will include the text subpassage and the corresponding embedding generated from it.

By default, documents are split into sentences and grouped in sections up to 250 words with 1 sentence overlap so that each chunk shares a sentence with the previous chunk.
Overlapping ensures continuity and prevents vital contextual information in the input text from being lost by a hard break.

{es} uses the https://unicode-org.github.io/icu-docs/[ICU4J] library to detect word and sentence boundaries for chunking.
https://unicode-org.github.io/icu/userguide/boundaryanalysis/#word-boundary[Word boundaries] are identified by following a series of rules, not just the presence of a whitespace character.
For written languages that do use whitespace such as Chinese or Japanese dictionary lookups are used to detect word boundaries.


[discrete]
==== Chunking strategies

Two strategies are available for chunking: `sentence` and `word`.

The `sentence` strategy splits the input text at sentence boundaries.
Each chunk contains one or more complete sentences ensuring that the integrity of sentence-level context is preserved, except when a sentence causes a chunk to exceed a word count of `max_chunk_size`, in which case it will be split across chunks.
The `sentence_overlap` option defines the number of sentences from the previous chunk to include in the current chunk which is either `0` or `1`.

The `word` strategy splits the input text on individual words up to the `max_chunk_size` limit.
The `overlap` option is the number of words from the previous chunk to include in the current chunk.

The default chunking strategy is `sentence`.

NOTE: The default chunking strategy for {infer} endpoints created before 8.16 is `word`.


[discrete]
==== Example of configuring the chunking behavior

The following example creates an {infer} endpoint with the `elasticsearch` service that deploys the ELSER model by default and configures the chunking behavior.

[source,console]
------------------------------------------------------------
PUT _inference/sparse_embedding/small_chunk_size
{
"service": "elasticsearch",
"service_settings": {
"num_allocations": 1,
"num_threads": 1
},
"chunking_settings": {
"strategy": "sentence",
"max_chunk_size": 100,
"sentence_overlap": 0
}
}
------------------------------------------------------------
// TEST[skip:TBD]


include::delete-inference.asciidoc[]
include::get-inference.asciidoc[]
include::post-inference.asciidoc[]
Expand Down
34 changes: 33 additions & 1 deletion docs/reference/inference/inference-shared.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,36 @@ end::task-settings[]

tag::task-type[]
The type of the {infer} task that the model will perform.
end::task-type[]
end::task-type[]

tag::chunking-settings[]
Chunking configuration object.
Refer to <<infer-chunking-config>> to learn more about chunking.
end::chunking-settings[]

tag::chunking-settings-max-chunking-size[]
Specifies the maximum size of a chunk in words.
Defaults to `250`.
This value cannot be higher than `300` or lower than `20` (for `sentence` strategy) or `10` (for `word` strategy).
end::chunking-settings-max-chunking-size[]

tag::chunking-settings-overlap[]
Only for `word` chunking strategy.
Specifies the number of overlapping words for chunks.
Defaults to `100`.
This value cannot be higher than the half of `max_chunking_size`.
end::chunking-settings-overlap[]

tag::chunking-settings-sentence-overlap[]
Only for `sentence` chunking strategy.
Specifies the numnber of overlapping sentences for chunks.
It can be either `1` or `0`.
Defaults to `1`.
end::chunking-settings-sentence-overlap[]

tag::chunking-settings-strategy[]
Specifies the chunking strategy.
It could be either `sentence` or `word`.
end::chunking-settings-strategy[]


Loading

0 comments on commit 19deea7

Please sign in to comment.