diff --git a/CHANGELOG.md b/CHANGELOG.md index c862b02c62f14..60e45730ba032 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -82,6 +82,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased 2.x] ### Added - [Extensions] Moving Extensions APIs to protobuf serialization. ([#6960](https://github.com/opensearch-project/OpenSearch/pull/6960)) +- Add descending order search optimization through reverse segment read. ([#7244](https://github.com/opensearch-project/OpenSearch/pull/7244)) ### Dependencies diff --git a/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java b/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java index 825aaee1ad1f8..f4be1cfff489c 100644 --- a/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java +++ b/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java @@ -31,6 +31,10 @@ package org.opensearch.cluster.metadata; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.PointValues; +import org.opensearch.OpenSearchException; import org.opensearch.cluster.AbstractDiffable; import org.opensearch.cluster.Diff; import org.opensearch.core.ParseField; @@ -46,6 +50,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Locale; import java.util.Map; @@ -59,6 +64,24 @@ public final class DataStream extends AbstractDiffable implements ToXContentObject { public static final String BACKING_INDEX_PREFIX = ".ds-"; + public static final String TIMESERIES_FIELDNAME = "@timestamp"; + public static final Comparator TIMESERIES_LEAF_SORTER = Comparator.comparingLong((LeafReader r) -> { + try { + PointValues points = r.getPointValues(TIMESERIES_FIELDNAME); + if (points != null) { + // could be a multipoint (probably not) but get the maximum time value anyway + byte[] sortValue = points.getMaxPackedValue(); + // decode the first dimension because this should not be a multi dimension field + // it's a bug in the date field if it is + return LongPoint.decodeDimension(sortValue, 0); + } else { + // segment does not have a timestamp field, just return the minimum value + return Long.MIN_VALUE; + } + } catch (IOException e) { + throw new OpenSearchException("Not a timeseries Index! Field [{}] not found!", TIMESERIES_FIELDNAME); + } + }).reversed(); private final String name; private final TimestampField timeStampField; diff --git a/server/src/main/java/org/opensearch/index/IndexSettings.java b/server/src/main/java/org/opensearch/index/IndexSettings.java index c10de678054d0..45c1ecb3a00dc 100644 --- a/server/src/main/java/org/opensearch/index/IndexSettings.java +++ b/server/src/main/java/org/opensearch/index/IndexSettings.java @@ -653,6 +653,7 @@ private void setRetentionLeaseMillis(final TimeValue retentionLease) { private volatile long mappingTotalFieldsLimit; private volatile long mappingDepthLimit; private volatile long mappingFieldNameLengthLimit; + private volatile boolean searchSegmentOrderReversed; /** * The maximum number of refresh listeners allows on this shard. @@ -897,6 +898,10 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti scopedSettings.addSettingsUpdateConsumer(INDEX_MERGE_ON_FLUSH_POLICY, this::setMergeOnFlushPolicy); } + private void setSearchSegmentOrderReversed(boolean reversed) { + this.searchSegmentOrderReversed = reversed; + } + private void setSearchIdleAfter(TimeValue searchIdleAfter) { this.searchIdleAfter = searchIdleAfter; } @@ -1068,6 +1073,13 @@ public Settings getNodeSettings() { return nodeSettings; } + /** + * Returns true if index level setting for leaf reverse order search optimization is enabled + */ + public boolean getSearchSegmentOrderReversed() { + return this.searchSegmentOrderReversed; + } + /** * Updates the settings and index metadata and notifies all registered settings consumers with the new settings iff at least one * setting has changed. diff --git a/server/src/main/java/org/opensearch/index/engine/EngineConfig.java b/server/src/main/java/org/opensearch/index/engine/EngineConfig.java index fe003405fd3f8..338a541af387a 100644 --- a/server/src/main/java/org/opensearch/index/engine/EngineConfig.java +++ b/server/src/main/java/org/opensearch/index/engine/EngineConfig.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.search.QueryCache; import org.apache.lucene.search.QueryCachingPolicy; @@ -59,6 +60,7 @@ import org.opensearch.indices.breaker.CircuitBreakerService; import org.opensearch.threadpool.ThreadPool; +import java.util.Comparator; import java.util.List; import java.util.Objects; import java.util.function.BooleanSupplier; @@ -102,6 +104,7 @@ public final class EngineConfig { private final Supplier retentionLeasesSupplier; private final boolean isReadOnlyReplica; private final BooleanSupplier primaryModeSupplier; + private final Comparator leafSorter; /** * A supplier of the outstanding retention leases. This is used during merged operations to determine which operations that have been @@ -204,6 +207,7 @@ private EngineConfig(Builder builder) { this.isReadOnlyReplica = builder.isReadOnlyReplica; this.primaryModeSupplier = builder.primaryModeSupplier; this.translogFactory = builder.translogFactory; + this.leafSorter = builder.leafSorter; } /** @@ -451,6 +455,15 @@ public TranslogDeletionPolicyFactory getCustomTranslogDeletionPolicyFactory() { return translogDeletionPolicyFactory; } + /** + * Returns subReaderSorter for org.apache.lucene.index.BaseCompositeReader. + * This gets used in lucene IndexReader and decides order of segment read. + * @return comparator + */ + public Comparator getLeafSorter() { + return this.leafSorter; + } + /** * Builder for EngineConfig class * @@ -483,6 +496,7 @@ public static class Builder { private boolean isReadOnlyReplica; private BooleanSupplier primaryModeSupplier; private TranslogFactory translogFactory = new InternalTranslogFactory(); + Comparator leafSorter; public Builder shardId(ShardId shardId) { this.shardId = shardId; @@ -614,6 +628,11 @@ public Builder translogFactory(TranslogFactory translogFactory) { return this; } + public Builder leafSorter(Comparator leafSorter) { + this.leafSorter = leafSorter; + return this; + } + public EngineConfig build() { return new EngineConfig(this); } diff --git a/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java b/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java index f5a5d50e11220..76b13ee244a2c 100644 --- a/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java +++ b/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java @@ -10,6 +10,7 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.search.QueryCache; import org.apache.lucene.search.QueryCachingPolicy; @@ -36,6 +37,7 @@ import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Optional; import java.util.function.BooleanSupplier; @@ -151,7 +153,8 @@ public EngineConfig newEngineConfig( EngineConfig.TombstoneDocSupplier tombstoneDocSupplier, boolean isReadOnlyReplica, BooleanSupplier primaryModeSupplier, - TranslogFactory translogFactory + TranslogFactory translogFactory, + Comparator leafSorter ) { CodecService codecServiceToUse = codecService; if (codecService == null && this.codecServiceFactory != null) { @@ -184,6 +187,7 @@ public EngineConfig newEngineConfig( .readOnlyReplica(isReadOnlyReplica) .primaryModeSupplier(primaryModeSupplier) .translogFactory(translogFactory) + .leafSorter(leafSorter) .build(); } diff --git a/server/src/main/java/org/opensearch/index/engine/InternalEngine.java b/server/src/main/java/org/opensearch/index/engine/InternalEngine.java index 5b25487bff234..3abda9de90cf6 100644 --- a/server/src/main/java/org/opensearch/index/engine/InternalEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/InternalEngine.java @@ -2322,6 +2322,9 @@ private IndexWriterConfig getIndexWriterConfig() { if (config().getIndexSort() != null) { iwc.setIndexSort(config().getIndexSort()); } + if (config().getLeafSorter() != null) { + iwc.setLeafSorter(config().getLeafSorter()); // The default segment search order + } return iwc; } diff --git a/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java b/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java index 5bccb4f6e827e..024f4b71584bf 100644 --- a/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java +++ b/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java @@ -33,6 +33,7 @@ package org.opensearch.index.mapper; import org.apache.lucene.analysis.Analyzer; +import org.opensearch.cluster.metadata.DataStream; import org.opensearch.index.IndexSettings; import org.opensearch.index.analysis.FieldNameAnalyzer; @@ -261,6 +262,15 @@ public String getNestedScope(String path) { return null; } + /** + * If this index contains @timestamp field with Date type, it will return true + * @return true or false based on above condition + */ + public boolean containsTimeStampField() { + MappedFieldType timeSeriesFieldType = this.fieldTypeLookup.get(DataStream.TIMESERIES_FIELDNAME); + return timeSeriesFieldType != null && timeSeriesFieldType instanceof DateFieldMapper.DateFieldType; // has to be Date field type + } + private static String parentObject(String field) { int lastDot = field.lastIndexOf('.'); if (lastDot == -1) { diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 8cd1a182fbdfe..3453ee196077c 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -56,6 +56,7 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ThreadInterruptedException; +import org.opensearch.cluster.metadata.DataStream; import org.opensearch.core.Assertions; import org.opensearch.ExceptionsHelper; import org.opensearch.OpenSearchException; @@ -326,6 +327,8 @@ Runnable getGlobalCheckpointSyncer() { private final Store remoteStore; private final BiFunction translogFactorySupplier; + private final boolean isTimeSeriesIndex; + public IndexShard( final ShardRouting shardRouting, final IndexSettings indexSettings, @@ -441,6 +444,9 @@ public boolean shouldCache(Query query) { this.checkpointPublisher = checkpointPublisher; this.remoteStore = remoteStore; this.translogFactorySupplier = translogFactorySupplier; + this.isTimeSeriesIndex = (mapperService == null || mapperService.documentMapper() == null) + ? false + : mapperService.documentMapper().mappers().containsTimeStampField(); } public ThreadPool getThreadPool() { @@ -3580,7 +3586,8 @@ private EngineConfig newEngineConfig(LongSupplier globalCheckpointSupplier) thro tombstoneDocSupplier(), isReadOnlyReplica, replicationTracker::isPrimaryMode, - translogFactorySupplier.apply(indexSettings, shardRouting) + translogFactorySupplier.apply(indexSettings, shardRouting), + isTimeSeriesIndex ? DataStream.TIMESERIES_LEAF_SORTER : null // DESC @timestamp default order for timeseries ); } @@ -4594,4 +4601,12 @@ RetentionLeaseSyncer getRetentionLeaseSyncer() { public GatedCloseable getSegmentInfosSnapshot() { return getEngine().getSegmentInfosSnapshot(); } + + /** + * If index is time series (if it contains @timestamp field) + * @return true or false based on above condition + */ + public boolean isTimeSeriesIndex() { + return this.isTimeSeriesIndex; + } } diff --git a/server/src/main/java/org/opensearch/search/DefaultSearchContext.java b/server/src/main/java/org/opensearch/search/DefaultSearchContext.java index b4278681346e3..1bde741973983 100644 --- a/server/src/main/java/org/opensearch/search/DefaultSearchContext.java +++ b/server/src/main/java/org/opensearch/search/DefaultSearchContext.java @@ -89,6 +89,7 @@ import org.opensearch.search.rescore.RescoreContext; import org.opensearch.search.slice.SliceBuilder; import org.opensearch.search.sort.SortAndFormats; +import org.opensearch.search.sort.SortOrder; import org.opensearch.search.suggest.SuggestionSearchContext; import java.io.IOException; @@ -210,7 +211,8 @@ final class DefaultSearchContext extends SearchContext { engineSearcher.getQueryCache(), engineSearcher.getQueryCachingPolicy(), lowLevelCancellation, - executor + executor, + shouldReverseLeafReaderContexts() ); this.relativeTimeSupplier = relativeTimeSupplier; this.timeout = timeout; @@ -885,4 +887,22 @@ public boolean isCancelled() { public ReaderContext readerContext() { return readerContext; } + + private boolean shouldReverseLeafReaderContexts() { + // Time series based workload by default traverses segments in desc order i.e. latest to the oldest order. + // This is actually beneficial for search queries to start search on latest segments first for time series workload. + // That can slow down ASC order queries on timestamp workload. So to avoid that slowdown, we will reverse leaf + // reader order here. + if (this.indexShard.isTimeSeriesIndex()) { + // Only reverse order for asc order sort queries + if (request != null + && request.source() != null + && request.source().sorts() != null + && request.source().sorts().size() > 0 + && request.source().sorts().get(0).order() == SortOrder.ASC) { + return true; + } + } + return false; + } } diff --git a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java index a7f509fd24c2b..818da075fd7ec 100644 --- a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java +++ b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java @@ -98,6 +98,12 @@ public class ContextIndexSearcher extends IndexSearcher implements Releasable { private QueryProfiler profiler; private MutableQueryTimeout cancellable; + /** + * Certain queries can benefit if we reverse the segment read order, + * for example time series based queries if searched for desc sort order + */ + private final boolean reverseLeafReaderContexts; + public ContextIndexSearcher( IndexReader reader, Similarity similarity, @@ -106,7 +112,37 @@ public ContextIndexSearcher( boolean wrapWithExitableDirectoryReader, Executor executor ) throws IOException { - this(reader, similarity, queryCache, queryCachingPolicy, new MutableQueryTimeout(), wrapWithExitableDirectoryReader, executor); + this( + reader, + similarity, + queryCache, + queryCachingPolicy, + new MutableQueryTimeout(), + wrapWithExitableDirectoryReader, + executor, + false + ); + } + + public ContextIndexSearcher( + IndexReader reader, + Similarity similarity, + QueryCache queryCache, + QueryCachingPolicy queryCachingPolicy, + boolean wrapWithExitableDirectoryReader, + Executor executor, + boolean reverseLeafReaderContexts + ) throws IOException { + this( + reader, + similarity, + queryCache, + queryCachingPolicy, + new MutableQueryTimeout(), + wrapWithExitableDirectoryReader, + executor, + reverseLeafReaderContexts + ); } private ContextIndexSearcher( @@ -116,13 +152,15 @@ private ContextIndexSearcher( QueryCachingPolicy queryCachingPolicy, MutableQueryTimeout cancellable, boolean wrapWithExitableDirectoryReader, - Executor executor + Executor executor, + boolean reverseLeafReaderContexts ) throws IOException { super(wrapWithExitableDirectoryReader ? new ExitableDirectoryReader((DirectoryReader) reader, cancellable) : reader, executor); setSimilarity(similarity); setQueryCache(queryCache); setQueryCachingPolicy(queryCachingPolicy); this.cancellable = cancellable; + this.reverseLeafReaderContexts = reverseLeafReaderContexts; } public void setProfiler(QueryProfiler profiler) { @@ -246,8 +284,15 @@ public void search( @Override protected void search(List leaves, Weight weight, Collector collector) throws IOException { - for (LeafReaderContext ctx : leaves) { // search each subreader - searchLeaf(ctx, weight, collector); + if (reverseLeafReaderContexts) { + // reverse the segment search order if this flag is true. + for (int i = leaves.size() - 1; i >= 0; i--) { + searchLeaf(leaves.get(i), weight, collector); + } + } else { + for (int i = 0; i < leaves.size(); i++) { + searchLeaf(leaves.get(i), weight, collector); + } } } diff --git a/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java b/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java index 2db3cd24da80d..f8bedc76ea994 100644 --- a/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java +++ b/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java @@ -69,7 +69,8 @@ public void testCreateEngineConfigFromFactory() { null, false, () -> Boolean.TRUE, - new InternalTranslogFactory() + new InternalTranslogFactory(), + null ); assertNotNull(config.getCodec()); @@ -148,7 +149,8 @@ public void testCreateCodecServiceFromFactory() { null, false, () -> Boolean.TRUE, - new InternalTranslogFactory() + new InternalTranslogFactory(), + null ); assertNotNull(config.getCodec()); } diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java index 2cdcd70869069..5af7be76b7b86 100644 --- a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java +++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java @@ -761,6 +761,7 @@ public Settings indexSettings() { ).getStringRep() ); } + return builder.build(); }