From 3c885798d1cceea6654f35f397f705c5b3015fb1 Mon Sep 17 00:00:00 2001 From: Luca Cavanna Date: Thu, 12 Jan 2023 13:49:06 +0100 Subject: [PATCH] Avoid doing I/O when fetching min and max for keyword fields (#92026) Whenever sorting on a date, numeric or keyword field (as primary sort), the can_match phase retrieves min and max for the field and sorts the shards (asc or desc depending on the sort order) so that they are going to be queried following that order. This allows incremental results to be exposed in that same order when using async search, as well as optimizations built on top of such behaviour (#51852). For fields with points we call `getMinPackedValue` and `getMaxPackedValue`, while for keyword fields we call `Terms#getMin` and `Terms#getMax`. Elasticsearch uses `FilterTerms` implementations to cancel queries as well as to track field usage. Such filter implementations should delegate their `getMin` and `getMax` calls to the wrapped `Terms` instance, which will leverage info from the block tree that caches min and max, otherwise they are always going to be retrieved from the index, which does I/O and slows the can_match phase down. --- docs/changelog/92026.yaml | 5 + .../internal/ExitableDirectoryReader.java | 12 +- .../FieldUsageTrackingDirectoryReader.java | 16 ++- .../internal/ContextIndexSearcherTests.java | 39 +++++++ ...ieldUsageTrackingDirectoryReaderTests.java | 110 ++++++++++++++++++ 5 files changed, 178 insertions(+), 4 deletions(-) create mode 100644 docs/changelog/92026.yaml create mode 100644 server/src/test/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReaderTests.java diff --git a/docs/changelog/92026.yaml b/docs/changelog/92026.yaml new file mode 100644 index 0000000000000..631daf0235b8b --- /dev/null +++ b/docs/changelog/92026.yaml @@ -0,0 +1,5 @@ +pr: 92026 +summary: Avoid doing I/O when fetching min and max for keyword fields +area: Search +type: bug +issues: [] diff --git a/server/src/main/java/org/elasticsearch/search/internal/ExitableDirectoryReader.java b/server/src/main/java/org/elasticsearch/search/internal/ExitableDirectoryReader.java index 0981afb74d138..3030e4827788b 100644 --- a/server/src/main/java/org/elasticsearch/search/internal/ExitableDirectoryReader.java +++ b/server/src/main/java/org/elasticsearch/search/internal/ExitableDirectoryReader.java @@ -127,7 +127,7 @@ static class ExitableTerms extends FilterLeafReader.FilterTerms { private final QueryCancellation queryCancellation; - private ExitableTerms(Terms terms, QueryCancellation queryCancellation) { + ExitableTerms(Terms terms, QueryCancellation queryCancellation) { super(terms); this.queryCancellation = queryCancellation; } @@ -141,6 +141,16 @@ public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throw public TermsEnum iterator() throws IOException { return new ExitableTermsEnum(in.iterator(), queryCancellation); } + + @Override + public BytesRef getMin() throws IOException { + return in.getMin(); + } + + @Override + public BytesRef getMax() throws IOException { + return in.getMax(); + } } /** diff --git a/server/src/main/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReader.java b/server/src/main/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReader.java index 7b89e9f38fc02..2081df0337324 100644 --- a/server/src/main/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReader.java +++ b/server/src/main/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReader.java @@ -85,11 +85,11 @@ public interface FieldUsageNotifier { void onTermVectorsUsed(String field); } - public static final class FieldUsageTrackingLeafReader extends SequentialStoredFieldsLeafReader { + static final class FieldUsageTrackingLeafReader extends SequentialStoredFieldsLeafReader { private final FieldUsageNotifier notifier; - public FieldUsageTrackingLeafReader(LeafReader in, FieldUsageNotifier notifier) { + FieldUsageTrackingLeafReader(LeafReader in, FieldUsageNotifier notifier) { super(in); this.notifier = notifier; } @@ -230,7 +230,7 @@ public long ramBytesUsed() { } } - private class FieldUsageTrackingTerms extends FilterTerms { + class FieldUsageTrackingTerms extends FilterTerms { private final String field; @@ -268,6 +268,16 @@ public long getSumTotalTermFreq() throws IOException { public long getSumDocFreq() throws IOException { return in.getSumDocFreq(); } + + @Override + public BytesRef getMin() throws IOException { + return in.getMin(); + } + + @Override + public BytesRef getMax() throws IOException { + return in.getMax(); + } } private class FieldUsageTrackingTermsEnum extends FilterTermsEnum { diff --git a/server/src/test/java/org/elasticsearch/search/internal/ContextIndexSearcherTests.java b/server/src/test/java/org/elasticsearch/search/internal/ContextIndexSearcherTests.java index 12c986f36f732..81ab7e2990326 100644 --- a/server/src/test/java/org/elasticsearch/search/internal/ContextIndexSearcherTests.java +++ b/server/src/test/java/org/elasticsearch/search/internal/ContextIndexSearcherTests.java @@ -16,6 +16,7 @@ import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FilterDirectoryReader; +import org.apache.lucene.index.FilterLeafReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -24,6 +25,7 @@ import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.BulkScorer; @@ -270,6 +272,43 @@ public void onRemoval(ShardId shardId, Accountable accountable) { IOUtils.close(reader, w, dir); } + public void testExitableTermsMinAndMax() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(null)); + Document doc = new Document(); + StringField fooField = new StringField("foo", "bar", Field.Store.NO); + doc.add(fooField); + w.addDocument(doc); + w.flush(); + + DirectoryReader directoryReader = DirectoryReader.open(w); + for (LeafReaderContext lfc : directoryReader.leaves()) { + Terms terms = lfc.reader().terms("foo"); + FilterLeafReader.FilterTerms filterTerms = new ExitableTerms(terms, new ExitableDirectoryReader.QueryCancellation() { + @Override + public boolean isEnabled() { + return false; + } + + @Override + public void checkCancelled() { + + } + }) { + @Override + public TermsEnum iterator() { + fail("Retrieving min and max should retrieve values from block tree instead of iterating"); + return null; + } + }; + assertEquals("bar", filterTerms.getMin().utf8ToString()); + assertEquals("bar", filterTerms.getMax().utf8ToString()); + } + w.close(); + directoryReader.close(); + dir.close(); + } + private SparseFixedBitSet query(LeafReaderContext leaf, String field, String value) throws IOException { SparseFixedBitSet sparseFixedBitSet = new SparseFixedBitSet(leaf.reader().maxDoc()); TermsEnum tenum = leaf.reader().terms(field).iterator(); diff --git a/server/src/test/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReaderTests.java b/server/src/test/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReaderTests.java new file mode 100644 index 0000000000000..249746a594605 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/search/internal/FieldUsageTrackingDirectoryReaderTests.java @@ -0,0 +1,110 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.search.internal; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.Directory; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; + +public class FieldUsageTrackingDirectoryReaderTests extends ESTestCase { + + public void testTermsMinAndMax() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(null)); + Document doc = new Document(); + StringField fooField = new StringField("foo", "bar", Field.Store.NO); + doc.add(fooField); + w.addDocument(doc); + w.flush(); + + DirectoryReader directoryReader = DirectoryReader.open(w); + for (LeafReaderContext lrc : directoryReader.leaves()) { + FieldUsageTrackingDirectoryReader.FieldUsageTrackingLeafReader leafReader = + new FieldUsageTrackingDirectoryReader.FieldUsageTrackingLeafReader(lrc.reader(), new TestFieldUsageNotifier()); + FieldUsageTrackingDirectoryReader.FieldUsageTrackingLeafReader.FieldUsageTrackingTerms terms = + leafReader.new FieldUsageTrackingTerms("foo", lrc.reader().terms("foo")) { + @Override + public TermsEnum iterator() { + fail("Retrieving min and max should retrieve values from block tree instead of iterating"); + return null; + } + }; + assertEquals("bar", terms.getMin().utf8ToString()); + assertEquals("bar", terms.getMax().utf8ToString()); + } + w.close(); + directoryReader.close(); + dir.close(); + } + + private static class TestFieldUsageNotifier implements FieldUsageTrackingDirectoryReader.FieldUsageNotifier { + @Override + public void onTermsUsed(String field) { + + } + + @Override + public void onPostingsUsed(String field) { + + } + + @Override + public void onTermFrequenciesUsed(String field) { + + } + + @Override + public void onPositionsUsed(String field) { + + } + + @Override + public void onOffsetsUsed(String field) { + + } + + @Override + public void onDocValuesUsed(String field) { + + } + + @Override + public void onStoredFieldsUsed(String field) { + + } + + @Override + public void onNormsUsed(String field) { + + } + + @Override + public void onPayloadsUsed(String field) { + + } + + @Override + public void onPointsUsed(String field) { + + } + + @Override + public void onTermVectorsUsed(String field) { + + } + } +}