Merge branch 'main' into esql_float_aggs

ChrisHegarty · Jun 17, 2024 · 994ca02 · 994ca02
2 parents d9c2f55 + 7ad9534
commit 994ca02
Show file tree

Hide file tree

Showing 267 changed files with 6,301 additions and 3,575 deletions.
diff --git a/benchmarks/build.gradle b/benchmarks/build.gradle
@@ -41,7 +41,7 @@ dependencies {
   api(project(':x-pack:plugin:esql-core'))
   api(project(':x-pack:plugin:esql'))
   api(project(':x-pack:plugin:esql:compute'))
-  implementation project(path: ':libs:elasticsearch-vec')
+  implementation project(path: ':libs:elasticsearch-simdvec')
   expression(project(path: ':modules:lang-expression', configuration: 'zip'))
   painless(project(path: ':modules:lang-painless', configuration: 'zip'))
   api "org.openjdk.jmh:jmh-core:$versions.jmh"

diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/vector/DistanceFunctionBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/vector/DistanceFunctionBenchmark.java
@@ -56,7 +56,7 @@ public class DistanceFunctionBenchmark {
     @Param({ "96" })
     private int dims;
 
-    @Param({ "dot", "cosine", "l1", "l2" })
+    @Param({ "dot", "cosine", "l1", "l2", "hamming" })
     private String function;
 
     @Param({ "knn", "binary" })
@@ -330,6 +330,18 @@ public void execute(Consumer<Object> consumer) {
         }
     }
 
+    private static class HammingKnnByteBenchmarkFunction extends KnnByteBenchmarkFunction {
+
+        private HammingKnnByteBenchmarkFunction(int dims) {
+            super(dims);
+        }
+
+        @Override
+        public void execute(Consumer<Object> consumer) {
+            new ByteKnnDenseVector(docVector).hamming(queryVector);
+        }
+    }
+
     private static class L1BinaryFloatBenchmarkFunction extends BinaryFloatBenchmarkFunction {
 
         private L1BinaryFloatBenchmarkFunction(int dims) {
@@ -354,6 +366,18 @@ public void execute(Consumer<Object> consumer) {
         }
     }
 
+    private static class HammingBinaryByteBenchmarkFunction extends BinaryByteBenchmarkFunction {
+
+        private HammingBinaryByteBenchmarkFunction(int dims) {
+            super(dims);
+        }
+
+        @Override
+        public void execute(Consumer<Object> consumer) {
+            new ByteBinaryDenseVector(vectorValue, docVector, dims).hamming(queryVector);
+        }
+    }
+
     private static class L2KnnFloatBenchmarkFunction extends KnnFloatBenchmarkFunction {
 
         private L2KnnFloatBenchmarkFunction(int dims) {
@@ -454,6 +478,11 @@ public void setBenchmarkFunction() {
                         case "binary" -> new L2BinaryByteBenchmarkFunction(dims);
                         default -> throw new UnsupportedOperationException("unexpected type [" + type + "]");
                     };
+                    case "hamming" -> benchmarkFunction = switch (type) {
+                        case "knn" -> new HammingKnnByteBenchmarkFunction(dims);
+                        case "binary" -> new HammingBinaryByteBenchmarkFunction(dims);
+                        default -> throw new UnsupportedOperationException("unexpected type [" + type + "]");
+                    };
                     default -> throw new UnsupportedOperationException("unexpected function [" + function + "]");
                 }
             }

diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/vector/VectorScorerBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/vector/VectorScorerBenchmark.java
@@ -22,7 +22,7 @@
 import org.apache.lucene.util.quantization.ScalarQuantizer;
 import org.elasticsearch.common.logging.LogConfigurator;
 import org.elasticsearch.core.IOUtils;
-import org.elasticsearch.vec.VectorScorerFactory;
+import org.elasticsearch.simdvec.VectorScorerFactory;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
 import org.openjdk.jmh.annotations.Fork;
@@ -41,8 +41,8 @@
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
 
-import static org.elasticsearch.vec.VectorSimilarityType.DOT_PRODUCT;
-import static org.elasticsearch.vec.VectorSimilarityType.EUCLIDEAN;
+import static org.elasticsearch.simdvec.VectorSimilarityType.DOT_PRODUCT;
+import static org.elasticsearch.simdvec.VectorSimilarityType.EUCLIDEAN;
 
 @Fork(value = 1, jvmArgsPrepend = { "--add-modules=jdk.incubator.vector" })
 @Warmup(iterations = 3, time = 3)

diff --git a/...n/java/org/elasticsearch/gradle/internal/InternalDistributionModuleCheckTaskProvider.java b/...n/java/org/elasticsearch/gradle/internal/InternalDistributionModuleCheckTaskProvider.java
@@ -62,8 +62,8 @@ public class InternalDistributionModuleCheckTaskProvider {
         "org.elasticsearch.preallocate",
         "org.elasticsearch.securesm",
         "org.elasticsearch.server",
+        "org.elasticsearch.simdvec",
         "org.elasticsearch.tdigest",
-        "org.elasticsearch.vec",
         "org.elasticsearch.xcontent"
     );
 

diff --git a/docs/changelog/108793.yaml b/docs/changelog/108793.yaml
@@ -0,0 +1,5 @@
+pr: 108793
+summary: Add `SparseVectorStats`
+area: Search
+type: enhancement
+issues: []
diff --git a/docs/changelog/109025.yaml b/docs/changelog/109025.yaml
@@ -0,0 +1,6 @@
+pr: 109025
+summary: Introduce a setting controlling the activation of the `logs` index mode in logs@settings
+area: Logs
+type: feature
+issues:
+ - 108762
diff --git a/docs/changelog/109317.yaml b/docs/changelog/109317.yaml
@@ -0,0 +1,13 @@
+pr: 109317
+summary: Add new int4 quantization to dense_vector
+area: Search
+type: feature
+issues: []
+highlight:
+  title: Add new int4 quantization to dense_vector
+  body: |-
+    New int4 (half-byte) scalar quantization support via two new index types: `int4_hnsw` and `int4_flat`.
+    This gives an 8x reduction from `float32` with some accuracy loss. In addition to less memory required, this
+    improves query and merge speed significantly when compared to raw vectors.
+  notable: true
+
diff --git a/docs/changelog/109359.yaml b/docs/changelog/109359.yaml
@@ -0,0 +1,5 @@
+pr: 109359
+summary: Adding hamming distance function to painless for `dense_vector` fields
+area: Vector Search
+type: enhancement
+issues: []
diff --git a/docs/changelog/109480.yaml b/docs/changelog/109480.yaml
@@ -0,0 +1,5 @@
+pr: 109480
+summary: "[Connector API] Add claim sync job endpoint"
+area: Application
+type: feature
+issues: []
diff --git a/docs/changelog/109634.yaml b/docs/changelog/109634.yaml
@@ -0,0 +1,5 @@
+pr: 109634
+summary: "[Query Rules] Require Enterprise License for Query Rules"
+area: Relevance
+type: enhancement
+issues: []
diff --git a/docs/changelog/109717.yaml b/docs/changelog/109717.yaml
@@ -0,0 +1,5 @@
+pr: 109717
+summary: Bump jackson version in modules:repository-azure
+area: Snapshot/Restore
+type: upgrade
+issues: []
diff --git a/docs/painless/painless-api-reference/painless-api-reference-score/index.asciidoc b/docs/painless/painless-api-reference/painless-api-reference-score/index.asciidoc
@@ -23,6 +23,7 @@ The following methods are directly callable without a class/instance qualifier.
 * double dotProduct(Object *, String *)
 * double l1norm(Object *, String *)
 * double l2norm(Object *, String *)
+* double hamming(Object *, String *)
 * double randomScore(int *)
 * double randomScore(int *, String *)
 * double saturation(double, double)

diff --git a/docs/reference/cat/nodes.asciidoc b/docs/reference/cat/nodes.asciidoc
@@ -1,14 +1,16 @@
 [[cat-nodes]]
 === cat nodes API
+
 ++++
 <titleabbrev>cat nodes</titleabbrev>
 ++++
 
 [IMPORTANT]
 ====
 cat APIs are only intended for human consumption using the command line or {kib}
-console. They are _not_ intended for use by applications. For application
-consumption, use the <<cluster-nodes-info,nodes info API>>.
+console.
+They are _not_ intended for use by applications.
+For application consumption, use the <<cluster-nodes-info,nodes info API>>.
 ====
 
 Returns information about a cluster's nodes.
@@ -32,13 +34,15 @@ include::{es-ref-dir}/rest-api/common-parms.asciidoc[tag=bytes]
 include::{es-ref-dir}/rest-api/common-parms.asciidoc[tag=http-format]
 
 `full_id`::
-(Optional, Boolean) If `true`, return the full node ID. If `false`, return the
-shortened node ID. Defaults to `false`.
+(Optional, Boolean) If `true`, return the full node ID.
+If `false`, return the shortened node ID.
+Defaults to `false`.
 
 include::{es-ref-dir}/rest-api/common-parms.asciidoc[tag=cat-h]
 +
 --
-If you do not specify which columns to include, the API returns the default columns in the order listed below. If you explicitly specify one or more columns, it only returns the specified columns.
+If you do not specify which columns to include, the API returns the default columns in the order listed below.
+If you explicitly specify one or more columns, it only returns the specified columns.
 
 Valid columns are:
 
@@ -58,7 +62,8 @@ Valid columns are:
 (Default) Used file descriptors percentage, such as `1`.
 
 `node.role`, `r`, `role`, `nodeRole`::
-(Default) Roles of the node. Returned values include
+(Default) Roles of the node.
+Returned values include
 `c` (cold node),
 `d` (data node),
 `f` (frozen node),
@@ -73,12 +78,13 @@ Valid columns are:
 `w` (warm node), and
 `-` (coordinating node only).
 +
-For example, `dim` indicates a master-eligible data and ingest node. See
+For example, `dim` indicates a master-eligible data and ingest node.
+See
 <<modules-node>>.
 
 `master`, `m`::
-(Default) Indicates whether the node is the elected master node. Returned values
-include `*` (elected master) and `-` (not elected master).
+(Default) Indicates whether the node is the elected master node.
+Returned values include `*` (elected master) and `-` (not elected master).
 
 `name`, `n`::
 (Default) Node name, such as `I8hydUG`.
@@ -149,9 +155,6 @@ Node uptime, such as `17.3m`.
 `completion.size`, `cs`, `completionSize`::
 Size of completion, such as `0b`.
 
-`dense_vector.value_count`, `dvc`, `denseVectorCount`::
-Number of indexed dense vector.
-
 `fielddata.memory_size`, `fm`, `fielddataMemory`::
 Used fielddata cache memory, such as `0b`.
 
@@ -306,8 +309,7 @@ Memory used by index writer, such as `18mb`.
 Memory used by version map, such as `1.0kb`.
 
 `segments.fixed_bitset_memory`, `sfbm`, `fixedBitsetMemory`::
-Memory used by fixed bit sets for nested object field types and type filters for
-types referred in <<parent-join,`join`>> fields, such as `1.0kb`.
+Memory used by fixed bit sets for nested object field types and type filters for types referred in <<parent-join,`join`>> fields, such as `1.0kb`.
 
 `suggest.current`, `suc`, `suggestCurrent`::
 Number of current suggest operations, such as `0`.
@@ -362,15 +364,13 @@ ip        heap.percent ram.percent cpu load_1m load_5m load_15m node.role master
 // TESTRESPONSE[s/65          99  42/\\d+ \\d+ \\d+/]
 // TESTRESPONSE[s/dim/.+/ s/[*]/[*]/ s/mJw06l1/.+/ non_json]
 
-The `ip`, `heap.percent`, `ram.percent`, `cpu`, and `load_*` columns provide the
-IP addresses and performance information of each node.
-
-The `node.role`, `master`, and `name` columns provide information useful for
-monitoring an entire cluster, particularly large ones.
+The `ip`, `heap.percent`, `ram.percent`, `cpu`, and `load_*` columns provide the IP addresses and performance information of each node.
 
+The `node.role`, `master`, and `name` columns provide information useful for monitoring an entire cluster, particularly large ones.
 
 [[cat-nodes-api-ex-headings]]
 ===== Example with explicit columns
+
 The following API request returns the `id`, `ip`, `port`, `v` (version), and `m`
 (master) columns.