Merge branch 'main' into carlosdelest/semantic-text-dense-vector-support

carlosdelest · Mar 5, 2024 · a3bdabf · a3bdabf
2 parents cdc579f + fe13a04
commit a3bdabf
Show file tree

Hide file tree

Showing 17 changed files with 402 additions and 122 deletions.
diff --git a/build-tools/src/main/java/org/elasticsearch/gradle/testclusters/ElasticsearchCluster.java b/build-tools/src/main/java/org/elasticsearch/gradle/testclusters/ElasticsearchCluster.java
@@ -433,7 +433,7 @@ private void commonNodeConfig() {
             if (node.getTestDistribution().equals(TestDistribution.INTEG_TEST)) {
                 node.defaultConfig.put("xpack.security.enabled", "false");
             } else {
-                if (node.getVersion().onOrAfter("7.16.0")) {
+                if (hasDeprecationIndexing(node)) {
                     node.defaultConfig.put("cluster.deprecation_indexing.enabled", "false");
                 }
             }
@@ -474,13 +474,17 @@ public void nextNodeToNextVersion() {
         commonNodeConfig();
         nodeIndex += 1;
         if (node.getTestDistribution().equals(TestDistribution.DEFAULT)) {
-            if (node.getVersion().onOrAfter("7.16.0")) {
+            if (hasDeprecationIndexing(node)) {
                 node.setting("cluster.deprecation_indexing.enabled", "false");
             }
         }
         node.start();
     }
 
+    private static boolean hasDeprecationIndexing(ElasticsearchNode node) { // gates the cluster.deprecation_indexing.enabled override
+        return node.getVersion().onOrAfter("7.16.0") && node.getSettingKeys().contains("stateless.enabled") == false;
+    } // true only for nodes on 7.16.0 or later whose setting keys do not include stateless.enabled
+
     @Override
     public void extraConfigFile(String destination, File from) {
         nodes.all(node -> node.extraConfigFile(destination, from));

diff --git a/docs/reference/mapping/params/format.asciidoc b/docs/reference/mapping/params/format.asciidoc
@@ -70,6 +70,11 @@ The following tables lists all the defaults ISO formats supported:
     (separated by `T`), is optional.
     Examples: `yyyy-MM-dd'T'HH:mm:ss.SSSZ` or  `yyyy-MM-dd`.
 
+    NOTE: When using `date_optional_time`, the parsing is lenient and will attempt to parse
+    numbers as a year (e.g. `292278994` will be parsed as a year). This can lead to unexpected results
+    when paired with a numeric-focused format such as `epoch_second` or `epoch_millis`.
+    It is recommended that you use `strict_date_optional_time` when pairing with a numeric-focused format.
+
 [[strict-date-time-nanos]]`strict_date_optional_time_nanos`::
 
     A generic ISO datetime parser, where the date must include the year at a minimum, and the time

diff --git a/...-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java b/...-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java
@@ -9,9 +9,7 @@
 package org.elasticsearch.index.mapper.extras;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermStates;
@@ -300,19 +298,23 @@ public RuntimePhraseScorer scorer(LeafReaderContext context) throws IOException
 
             @Override
             public Matches matches(LeafReaderContext context, int doc) throws IOException {
-                FieldInfo fi = context.reader().getFieldInfos().fieldInfo(field);
-                if (fi == null) {
+                var terms = context.reader().terms(field);
+                if (terms == null) {
                     return null;
                 }
-                // Some highlighters will already have reindexed the source with positions and offsets,
+                // Some highlighters will already have re-indexed the source with positions and offsets,
                 // so rather than doing it again we check to see if this data is available on the
                 // current context and if so delegate directly to the inner query
-                if (fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0) {
+                if (terms.hasOffsets()) {
                     Weight innerWeight = in.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1);
                     return innerWeight.matches(context, doc);
                 }
                 RuntimePhraseScorer scorer = scorer(context);
-                if (scorer == null || scorer.iterator().advance(doc) != doc) {
+                if (scorer == null) {
+                    return null;
+                }
+                final TwoPhaseIterator twoPhase = scorer.twoPhaseIterator();
+                if (twoPhase.approximation().advance(doc) != doc || scorer.twoPhaseIterator().matches() == false) {
                     return null;
                 }
                 return scorer.matches();
@@ -321,13 +323,14 @@ public Matches matches(LeafReaderContext context, int doc) throws IOException {
     }
 
     private class RuntimePhraseScorer extends Scorer {
-
         private final LeafSimScorer scorer;
         private final CheckedIntFunction<List<Object>, IOException> valueFetcher;
         private final String field;
         private final Query query;
         private final TwoPhaseIterator twoPhase;
 
+        private final MemoryIndexEntry cacheEntry = new MemoryIndexEntry();
+
         private int doc = -1;
         private float freq;
 
@@ -357,7 +360,6 @@ public float matchCost() {
                     // Defaults to a high-ish value so that it likely runs last.
                     return 10_000f;
                 }
-
             };
         }
 
@@ -394,35 +396,35 @@ private float freq() throws IOException {
             return freq;
         }
 
-        private float computeFreq() throws IOException {
-            MemoryIndex index = new MemoryIndex();
-            index.setSimilarity(FREQ_SIMILARITY);
-            List<Object> values = valueFetcher.apply(docID());
-            float frequency = 0;
-            for (Object value : values) {
-                if (value == null) {
-                    continue;
+        private MemoryIndex getOrCreateMemoryIndex() throws IOException {
+            if (cacheEntry.docID != docID()) {
+                cacheEntry.docID = docID();
+                cacheEntry.memoryIndex = new MemoryIndex(true, false);
+                cacheEntry.memoryIndex.setSimilarity(FREQ_SIMILARITY);
+                List<Object> values = valueFetcher.apply(docID());
+                for (Object value : values) {
+                    if (value == null) {
+                        continue;
+                    }
+                    cacheEntry.memoryIndex.addField(field, value.toString(), indexAnalyzer);
                 }
-                index.addField(field, value.toString(), indexAnalyzer);
-                frequency += index.search(query);
-                index.reset();
             }
-            return frequency;
+            return cacheEntry.memoryIndex;
+        }
+
+        private float computeFreq() throws IOException { // searches the query against the memory index returned for the current doc
+            return getOrCreateMemoryIndex().search(query); // the index is rebuilt only when docID() differs from the cached entry
+        }
 
         private Matches matches() throws IOException {
-            MemoryIndex index = new MemoryIndex(true, false);
-            List<Object> values = valueFetcher.apply(docID());
-            for (Object value : values) {
-                if (value == null) {
-                    continue;
-                }
-                index.addField(field, value.toString(), indexAnalyzer);
-            }
-            IndexSearcher searcher = index.createSearcher();
+            IndexSearcher searcher = getOrCreateMemoryIndex().createSearcher();
             Weight w = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1);
             return w.matches(searcher.getLeafContexts().get(0), 0);
         }
     }
 
+    private static class MemoryIndexEntry { // cache slot held by each RuntimePhraseScorer: a doc ID plus the MemoryIndex built for it
+        private int docID = -1; // starts at -1; overwritten with the scorer's docID() whenever a new index is built
+        private MemoryIndex memoryIndex; // null until the first index is built
+    }
 }
diff --git a/...as/src/test/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQueryTests.java b/...as/src/test/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQueryTests.java
@@ -49,13 +49,19 @@
 import java.io.IOException;
 import java.util.Collections;
 import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
 
+import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;
 
 public class SourceConfirmedTextQueryTests extends ESTestCase {
 
+    private static final AtomicInteger sourceFetchCount = new AtomicInteger();
     private static final IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> SOURCE_FETCHER_PROVIDER =
-        context -> docID -> Collections.<Object>singletonList(context.reader().document(docID).get("body"));
+        context -> docID -> {
+            sourceFetchCount.incrementAndGet();
+            return Collections.<Object>singletonList(context.reader().document(docID).get("body"));
+        };
 
     public void testTerm() throws Exception {
         try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(Lucene.STANDARD_ANALYZER))) {
@@ -440,11 +446,11 @@ public void testEmptyIndex() throws Exception {
     }
 
     public void testMatches() throws Exception {
-        checkMatches(new TermQuery(new Term("body", "d")), "a b c d e", new int[] { 3, 3 });
-        checkMatches(new PhraseQuery("body", "b", "c"), "a b c d c b c a", new int[] { 1, 2, 5, 6 });
+        checkMatches(new TermQuery(new Term("body", "d")), "a b c d e", new int[] { 3, 3 }, false);
+        checkMatches(new PhraseQuery("body", "b", "c"), "a b c d c b c a", new int[] { 1, 2, 5, 6 }, true);
     }
 
-    private static void checkMatches(Query query, String inputDoc, int[] expectedMatches) throws IOException {
+    private static void checkMatches(Query query, String inputDoc, int[] expectedMatches, boolean expectedFetch) throws IOException {
         try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(Lucene.STANDARD_ANALYZER))) {
             Document doc = new Document();
             doc.add(new TextField("body", "xxxxxnomatchxxxx", Store.YES));
@@ -464,30 +470,48 @@ private static void checkMatches(Query query, String inputDoc, int[] expectedMat
             Query sourceConfirmedQuery = new SourceConfirmedTextQuery(query, SOURCE_FETCHER_PROVIDER, Lucene.STANDARD_ANALYZER);
 
             try (IndexReader ir = DirectoryReader.open(w)) {
-
-                IndexSearcher searcher = new IndexSearcher(ir);
-                TopDocs td = searcher.search(
-                    sourceConfirmedQuery,
-                    3,
-                    new Sort(KeywordField.newSortField("sort", false, SortedSetSelector.Type.MAX))
-                );
-
-                Weight weight = searcher.createWeight(searcher.rewrite(sourceConfirmedQuery), ScoreMode.COMPLETE_NO_SCORES, 1);
-
-                int firstDoc = td.scoreDocs[0].doc;
-                LeafReaderContext firstCtx = searcher.getLeafContexts().get(ReaderUtil.subIndex(firstDoc, searcher.getLeafContexts()));
-                checkMatches(weight, firstCtx, firstDoc - firstCtx.docBase, expectedMatches, 0);
-
-                int secondDoc = td.scoreDocs[1].doc;
-                LeafReaderContext secondCtx = searcher.getLeafContexts().get(ReaderUtil.subIndex(secondDoc, searcher.getLeafContexts()));
-                checkMatches(weight, secondCtx, secondDoc - secondCtx.docBase, expectedMatches, 1);
-
+                {
+                    IndexSearcher searcher = new IndexSearcher(ir);
+                    TopDocs td = searcher.search(
+                        sourceConfirmedQuery,
+                        3,
+                        new Sort(KeywordField.newSortField("sort", false, SortedSetSelector.Type.MAX))
+                    );
+
+                    Weight weight = searcher.createWeight(searcher.rewrite(sourceConfirmedQuery), ScoreMode.COMPLETE_NO_SCORES, 1);
+
+                    int firstDoc = td.scoreDocs[0].doc;
+                    LeafReaderContext firstCtx = searcher.getLeafContexts().get(ReaderUtil.subIndex(firstDoc, searcher.getLeafContexts()));
+                    checkMatches(weight, firstCtx, firstDoc - firstCtx.docBase, expectedMatches, 0, expectedFetch);
+
+                    int secondDoc = td.scoreDocs[1].doc;
+                    LeafReaderContext secondCtx = searcher.getLeafContexts()
+                        .get(ReaderUtil.subIndex(secondDoc, searcher.getLeafContexts()));
+                    checkMatches(weight, secondCtx, secondDoc - secondCtx.docBase, expectedMatches, 1, expectedFetch);
+                }
+
+                {
+                    IndexSearcher searcher = new IndexSearcher(ir);
+                    TopDocs td = searcher.search(KeywordField.newExactQuery("sort", "0"), 1);
+
+                    Weight weight = searcher.createWeight(searcher.rewrite(sourceConfirmedQuery), ScoreMode.COMPLETE_NO_SCORES, 1);
+                    int firstDoc = td.scoreDocs[0].doc;
+                    LeafReaderContext firstCtx = searcher.getLeafContexts().get(ReaderUtil.subIndex(firstDoc, searcher.getLeafContexts()));
+                    checkMatches(weight, firstCtx, firstDoc - firstCtx.docBase, new int[0], 0, false);
+                }
             }
         }
     }
 
-    private static void checkMatches(Weight w, LeafReaderContext ctx, int doc, int[] expectedMatches, int offset) throws IOException {
+    private static void checkMatches(Weight w, LeafReaderContext ctx, int doc, int[] expectedMatches, int offset, boolean expectedFetch)
+        throws IOException {
+        int count = sourceFetchCount.get();
         Matches matches = w.matches(ctx, doc);
+        if (expectedMatches.length == 0) {
+            assertNull(matches);
+            assertThat(sourceFetchCount.get() - count, equalTo(expectedFetch ? 1 : 0));
+            return;
+        }
         assertNotNull(matches);
         MatchesIterator mi = matches.getMatches("body");
         int i = 0;
@@ -498,6 +522,7 @@ private static void checkMatches(Weight w, LeafReaderContext ctx, int doc, int[]
             i += 2;
         }
         assertEquals(expectedMatches.length, i);
+        assertThat(sourceFetchCount.get() - count, equalTo(expectedFetch ? 1 : 0));
     }
 
 }