diff --git a/release-notes/opensearch.release-notes-2.8.0.md b/release-notes/opensearch.release-notes-2.8.0.md index 556e1f0047595..4f7dc38997e1e 100644 --- a/release-notes/opensearch.release-notes-2.8.0.md +++ b/release-notes/opensearch.release-notes-2.8.0.md @@ -10,6 +10,7 @@ - [Search Pipelines] Add RenameFieldResponseProcessor for Search Pipelines ([#7377](https://github.com/opensearch-project/OpenSearch/pull/7377)) - [Search Pipelines] Split search pipeline processor factories by type ([#7597](https://github.com/opensearch-project/OpenSearch/pull/7597)) - [Search Pipelines] Add script processor ([#7607](https://github.com/opensearch-project/OpenSearch/pull/7607)) +- Add descending order search optimization through reverse segment read. ([#7244](https://github.com/opensearch-project/OpenSearch/pull/7244)) - Add 'unsigned_long' numeric field type ([#6237](https://github.com/opensearch-project/OpenSearch/pull/6237)) - Add back primary shard preference for queries ([#7375](https://github.com/opensearch-project/OpenSearch/pull/7375)) - Add task cancellation timestamp in task API ([#7455](https://github.com/opensearch-project/OpenSearch/pull/7455)) diff --git a/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java b/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java index 825aaee1ad1f8..f4be1cfff489c 100644 --- a/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java +++ b/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java @@ -31,6 +31,10 @@ package org.opensearch.cluster.metadata; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.PointValues; +import org.opensearch.OpenSearchException; import org.opensearch.cluster.AbstractDiffable; import org.opensearch.cluster.Diff; import org.opensearch.core.ParseField; @@ -46,6 +50,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Locale; import java.util.Map; @@ -59,6 +64,24 @@ public final class DataStream extends AbstractDiffable implements ToXContentObject { public static final String BACKING_INDEX_PREFIX = ".ds-"; + public static final String TIMESERIES_FIELDNAME = "@timestamp"; + public static final Comparator TIMESERIES_LEAF_SORTER = Comparator.comparingLong((LeafReader r) -> { + try { + PointValues points = r.getPointValues(TIMESERIES_FIELDNAME); + if (points != null) { + // could be a multipoint (probably not) but get the maximum time value anyway + byte[] sortValue = points.getMaxPackedValue(); + // decode the first dimension because this should not be a multi dimension field + // it's a bug in the date field if it is + return LongPoint.decodeDimension(sortValue, 0); + } else { + // segment does not have a timestamp field, just return the minimum value + return Long.MIN_VALUE; + } + } catch (IOException e) { + throw new OpenSearchException("Not a timeseries Index! Field [{}] not found!", TIMESERIES_FIELDNAME); + } + }).reversed(); private final String name; private final TimestampField timeStampField; diff --git a/server/src/main/java/org/opensearch/index/IndexSettings.java b/server/src/main/java/org/opensearch/index/IndexSettings.java index e125facb76059..de7dc102939ce 100644 --- a/server/src/main/java/org/opensearch/index/IndexSettings.java +++ b/server/src/main/java/org/opensearch/index/IndexSettings.java @@ -665,6 +665,7 @@ private void setRetentionLeaseMillis(final TimeValue retentionLease) { private volatile long mappingTotalFieldsLimit; private volatile long mappingDepthLimit; private volatile long mappingFieldNameLengthLimit; + private volatile boolean searchSegmentOrderReversed; /** * The maximum number of refresh listeners allows on this shard. @@ -905,6 +906,10 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti scopedSettings.addSettingsUpdateConsumer(DEFAULT_SEARCH_PIPELINE, this::setDefaultSearchPipeline); } + private void setSearchSegmentOrderReversed(boolean reversed) { + this.searchSegmentOrderReversed = reversed; + } + private void setSearchIdleAfter(TimeValue searchIdleAfter) { this.searchIdleAfter = searchIdleAfter; } @@ -1084,6 +1089,13 @@ public Settings getNodeSettings() { return nodeSettings; } + /** + * Returns true if index level setting for leaf reverse order search optimization is enabled + */ + public boolean getSearchSegmentOrderReversed() { + return this.searchSegmentOrderReversed; + } + /** * Updates the settings and index metadata and notifies all registered settings consumers with the new settings iff at least one * setting has changed. diff --git a/server/src/main/java/org/opensearch/index/engine/EngineConfig.java b/server/src/main/java/org/opensearch/index/engine/EngineConfig.java index fe003405fd3f8..338a541af387a 100644 --- a/server/src/main/java/org/opensearch/index/engine/EngineConfig.java +++ b/server/src/main/java/org/opensearch/index/engine/EngineConfig.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.search.QueryCache; import org.apache.lucene.search.QueryCachingPolicy; @@ -59,6 +60,7 @@ import org.opensearch.indices.breaker.CircuitBreakerService; import org.opensearch.threadpool.ThreadPool; +import java.util.Comparator; import java.util.List; import java.util.Objects; import java.util.function.BooleanSupplier; @@ -102,6 +104,7 @@ public final class EngineConfig { private final Supplier retentionLeasesSupplier; private final boolean isReadOnlyReplica; private final BooleanSupplier primaryModeSupplier; + private final Comparator leafSorter; /** * A supplier of the outstanding retention leases. This is used during merged operations to determine which operations that have been @@ -204,6 +207,7 @@ private EngineConfig(Builder builder) { this.isReadOnlyReplica = builder.isReadOnlyReplica; this.primaryModeSupplier = builder.primaryModeSupplier; this.translogFactory = builder.translogFactory; + this.leafSorter = builder.leafSorter; } /** @@ -451,6 +455,15 @@ public TranslogDeletionPolicyFactory getCustomTranslogDeletionPolicyFactory() { return translogDeletionPolicyFactory; } + /** + * Returns subReaderSorter for org.apache.lucene.index.BaseCompositeReader. + * This gets used in lucene IndexReader and decides order of segment read. + * @return comparator + */ + public Comparator getLeafSorter() { + return this.leafSorter; + } + /** * Builder for EngineConfig class * @@ -483,6 +496,7 @@ public static class Builder { private boolean isReadOnlyReplica; private BooleanSupplier primaryModeSupplier; private TranslogFactory translogFactory = new InternalTranslogFactory(); + Comparator leafSorter; public Builder shardId(ShardId shardId) { this.shardId = shardId; @@ -614,6 +628,11 @@ public Builder translogFactory(TranslogFactory translogFactory) { return this; } + public Builder leafSorter(Comparator leafSorter) { + this.leafSorter = leafSorter; + return this; + } + public EngineConfig build() { return new EngineConfig(this); } diff --git a/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java b/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java index f5a5d50e11220..76b13ee244a2c 100644 --- a/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java +++ b/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java @@ -10,6 +10,7 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.search.QueryCache; import org.apache.lucene.search.QueryCachingPolicy; @@ -36,6 +37,7 @@ import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Optional; import java.util.function.BooleanSupplier; @@ -151,7 +153,8 @@ public EngineConfig newEngineConfig( EngineConfig.TombstoneDocSupplier tombstoneDocSupplier, boolean isReadOnlyReplica, BooleanSupplier primaryModeSupplier, - TranslogFactory translogFactory + TranslogFactory translogFactory, + Comparator leafSorter ) { CodecService codecServiceToUse = codecService; if (codecService == null && this.codecServiceFactory != null) { @@ -184,6 +187,7 @@ public EngineConfig newEngineConfig( .readOnlyReplica(isReadOnlyReplica) .primaryModeSupplier(primaryModeSupplier) .translogFactory(translogFactory) + .leafSorter(leafSorter) .build(); } diff --git a/server/src/main/java/org/opensearch/index/engine/InternalEngine.java b/server/src/main/java/org/opensearch/index/engine/InternalEngine.java index 63afc6585a99d..b96bade177be2 100644 --- a/server/src/main/java/org/opensearch/index/engine/InternalEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/InternalEngine.java @@ -2322,6 +2322,9 @@ private IndexWriterConfig getIndexWriterConfig() { if (config().getIndexSort() != null) { iwc.setIndexSort(config().getIndexSort()); } + if (config().getLeafSorter() != null) { + iwc.setLeafSorter(config().getLeafSorter()); // The default segment search order + } return iwc; } diff --git a/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java b/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java index 5bccb4f6e827e..024f4b71584bf 100644 --- a/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java +++ b/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java @@ -33,6 +33,7 @@ package org.opensearch.index.mapper; import org.apache.lucene.analysis.Analyzer; +import org.opensearch.cluster.metadata.DataStream; import org.opensearch.index.IndexSettings; import org.opensearch.index.analysis.FieldNameAnalyzer; @@ -261,6 +262,15 @@ public String getNestedScope(String path) { return null; } + /** + * If this index contains @timestamp field with Date type, it will return true + * @return true or false based on above condition + */ + public boolean containsTimeStampField() { + MappedFieldType timeSeriesFieldType = this.fieldTypeLookup.get(DataStream.TIMESERIES_FIELDNAME); + return timeSeriesFieldType != null && timeSeriesFieldType instanceof DateFieldMapper.DateFieldType; // has to be Date field type + } + private static String parentObject(String field) { int lastDot = field.lastIndexOf('.'); if (lastDot == -1) { diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index fc236d7d97a04..c0e7a8278e821 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -58,6 +58,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ThreadInterruptedException; import org.opensearch.common.lucene.store.ByteArrayIndexInput; +import org.opensearch.cluster.metadata.DataStream; import org.opensearch.core.Assertions; import org.opensearch.ExceptionsHelper; import org.opensearch.OpenSearchException; @@ -333,6 +334,7 @@ Runnable getGlobalCheckpointSyncer() { private volatile boolean useRetentionLeasesInPeerRecovery; private final Store remoteStore; private final BiFunction translogFactorySupplier; + private final boolean isTimeSeriesIndex; private final RemoteRefreshSegmentPressureService remoteRefreshSegmentPressureService; public IndexShard( @@ -451,6 +453,9 @@ public boolean shouldCache(Query query) { this.checkpointPublisher = checkpointPublisher; this.remoteStore = remoteStore; this.translogFactorySupplier = translogFactorySupplier; + this.isTimeSeriesIndex = (mapperService == null || mapperService.documentMapper() == null) + ? false + : mapperService.documentMapper().mappers().containsTimeStampField(); this.remoteRefreshSegmentPressureService = remoteRefreshSegmentPressureService; } @@ -3627,7 +3632,8 @@ private EngineConfig newEngineConfig(LongSupplier globalCheckpointSupplier) thro tombstoneDocSupplier(), isReadOnlyReplica, replicationTracker::isPrimaryMode, - translogFactorySupplier.apply(indexSettings, shardRouting) + translogFactorySupplier.apply(indexSettings, shardRouting), + isTimeSeriesIndex ? DataStream.TIMESERIES_LEAF_SORTER : null // DESC @timestamp default order for timeseries ); } @@ -4670,4 +4676,12 @@ RetentionLeaseSyncer getRetentionLeaseSyncer() { public GatedCloseable getSegmentInfosSnapshot() { return getEngine().getSegmentInfosSnapshot(); } + + /** + * If index is time series (if it contains @timestamp field) + * @return true or false based on above condition + */ + public boolean isTimeSeriesIndex() { + return this.isTimeSeriesIndex; + } } diff --git a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java index cf21712fc912a..1a84aba5fff7d 100644 --- a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java +++ b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java @@ -75,6 +75,7 @@ import org.opensearch.search.query.QuerySearchResult; import org.opensearch.search.sort.FieldSortBuilder; import org.opensearch.search.sort.MinAndMax; +import org.opensearch.search.sort.SortOrder; import java.io.IOException; import java.util.ArrayList; @@ -282,8 +283,17 @@ public void search( @Override protected void search(List leaves, Weight weight, Collector collector) throws IOException { - for (LeafReaderContext ctx : leaves) { // search each subreader - searchLeaf(ctx, weight, collector); + if (shouldReverseLeafReaderContexts()) { + // reverse the segment search order if this flag is true. + // Certain queries can benefit if we reverse the segment read order, + // for example time series based queries if searched for desc sort order. + for (int i = leaves.size() - 1; i >= 0; i--) { + searchLeaf(leaves.get(i), weight, collector); + } + } else { + for (int i = 0; i < leaves.size(); i++) { + searchLeaf(leaves.get(i), weight, collector); + } } } @@ -496,4 +506,22 @@ private boolean canMatchSearchAfter(LeafReaderContext ctx) throws IOException { } return true; } + + private boolean shouldReverseLeafReaderContexts() { + // Time series based workload by default traverses segments in desc order i.e. latest to the oldest order. + // This is actually beneficial for search queries to start search on latest segments first for time series workload. + // That can slow down ASC order queries on timestamp workload. So to avoid that slowdown, we will reverse leaf + // reader order here. + if (searchContext != null && searchContext.indexShard().isTimeSeriesIndex()) { + // Only reverse order for asc order sort queries + if (searchContext.request() != null + && searchContext.request().source() != null + && searchContext.request().source().sorts() != null + && searchContext.request().source().sorts().size() > 0 + && searchContext.request().source().sorts().get(0).order() == SortOrder.ASC) { + return true; + } + } + return false; + } } diff --git a/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java b/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java index 2db3cd24da80d..f8bedc76ea994 100644 --- a/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java +++ b/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java @@ -69,7 +69,8 @@ public void testCreateEngineConfigFromFactory() { null, false, () -> Boolean.TRUE, - new InternalTranslogFactory() + new InternalTranslogFactory(), + null ); assertNotNull(config.getCodec()); @@ -148,7 +149,8 @@ public void testCreateCodecServiceFromFactory() { null, false, () -> Boolean.TRUE, - new InternalTranslogFactory() + new InternalTranslogFactory(), + null ); assertNotNull(config.getCodec()); } diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java index 06f74a55be13d..4474aae1f0631 100644 --- a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java +++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java @@ -761,6 +761,7 @@ public Settings indexSettings() { ).getStringRep() ); } + return builder.build(); }