diff --git a/CHANGELOG.md b/CHANGELOG.md index 683fda670..2fea42b6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Allowing execution of hybrid query on index alias with filters ([#670](https://github.com/opensearch-project/neural-search/pull/670)) - Allowing query by raw tokens in neural_sparse query ([#693](https://github.com/opensearch-project/neural-search/pull/693)) - Removed stream.findFirst implementation to use more native iteration implement to improve hybrid query latencies by 35% ([#706](https://github.com/opensearch-project/neural-search/pull/706)) +- Removed map of subquery to subquery index in favor of storing index as part of disi wrapper to improve hybrid query latencies by 20% ([#711](https://github.com/opensearch-project/neural-search/pull/711)) ### Bug Fixes - Add support for request_cache flag in hybrid query ([#663](https://github.com/opensearch-project/neural-search/pull/663)) ### Infrastructure diff --git a/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java b/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java index 1da610f53..32029d8c5 100644 --- a/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java +++ b/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java @@ -8,18 +8,13 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.HashMap; import java.util.List; -import java.util.Locale; -import java.util.Map; import java.util.Objects; -import com.google.common.primitives.Ints; import org.apache.lucene.search.DisiPriorityQueue; import org.apache.lucene.search.DisiWrapper; import org.apache.lucene.search.DisjunctionDISIApproximation; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TwoPhaseIterator; @@ -27,6 +22,7 @@ import lombok.Getter; import org.apache.lucene.util.PriorityQueue; +import org.opensearch.neuralsearch.search.HybridDisiWrapper; /** * Class abstracts functionality of Scorer for hybrid query. When iterating over documents in increasing @@ -43,8 +39,6 @@ public final class HybridQueryScorer extends Scorer { private final float[] subScores; - private final Map queryToIndex; - private final DocIdSetIterator approximation; private final HybridScoreBlockBoundaryPropagator disjunctionBlockPropagator; private final TwoPhase twoPhase; @@ -57,7 +51,6 @@ public HybridQueryScorer(final Weight weight, final List subScorers) thr super(weight); this.subScorers = Collections.unmodifiableList(subScorers); subScores = new float[subScorers.size()]; - this.queryToIndex = mapQueryToIndex(); this.subScorersPQ = initializeSubScorersPQ(); boolean needsScores = scoreMode != ScoreMode.COMPLETE_NO_SCORES; @@ -194,70 +187,30 @@ public int docID() { */ public float[] hybridScores() throws IOException { float[] scores = new float[subScores.length]; - DisiWrapper topList = subScorersPQ.topList(); - for (DisiWrapper disiWrapper = topList; disiWrapper != null; disiWrapper = disiWrapper.next) { + HybridDisiWrapper topList = (HybridDisiWrapper) subScorersPQ.topList(); + for (HybridDisiWrapper disiWrapper = topList; disiWrapper != null; disiWrapper = (HybridDisiWrapper) disiWrapper.next) { // check if this doc has match in the subQuery. If not, add score as 0.0 and continue Scorer scorer = disiWrapper.scorer; if (scorer.docID() == DocIdSetIterator.NO_MORE_DOCS) { continue; } - Query query = scorer.getWeight().getQuery(); - int[] indexes = queryToIndex.get(query); - // we need to find the index of first sub-query that hasn't been set yet. Such score will have initial value of "0.0" - int index = -1; - for (int idx : indexes) { - if (Float.compare(scores[idx], 0.0f) == 0) { - index = idx; - break; - } - } - if (index == -1) { - throw new IllegalStateException( - String.format( - Locale.ROOT, - "cannot set score for one of hybrid search subquery [%s] and document [%d]", - query.toString(), - scorer.docID() - ) - ); - } - scores[index] = scorer.score(); + scores[disiWrapper.getSubQueryIndex()] = scorer.score(); } return scores; } - private Map mapQueryToIndex() { - // we need list as number of identical queries is unknown - Map> queryToListOfIndexes = new HashMap<>(); - int idx = 0; - for (Scorer scorer : subScorers) { - if (scorer == null) { - idx++; - continue; - } - Query query = scorer.getWeight().getQuery(); - queryToListOfIndexes.putIfAbsent(query, new ArrayList<>()); - queryToListOfIndexes.get(query).add(idx); - idx++; - } - // convert to the int array for better performance - Map queryToIndex = new HashMap<>(); - queryToListOfIndexes.forEach((key, value) -> queryToIndex.put(key, Ints.toArray(value))); - return queryToIndex; - } - private DisiPriorityQueue initializeSubScorersPQ() { - Objects.requireNonNull(queryToIndex, "should not be null"); Objects.requireNonNull(subScorers, "should not be null"); // we need to count this way in order to include all identical sub-queries - int numOfSubQueries = queryToIndex.values().stream().map(array -> array.length).reduce(0, Integer::sum); + int numOfSubQueries = subScorers.size(); DisiPriorityQueue subScorersPQ = new DisiPriorityQueue(numOfSubQueries); - for (Scorer scorer : subScorers) { + for (int idx = 0; idx < subScorers.size(); idx++) { + Scorer scorer = subScorers.get(idx); if (scorer == null) { continue; } - final DisiWrapper w = new DisiWrapper(scorer); - subScorersPQ.add(w); + final HybridDisiWrapper disiWrapper = new HybridDisiWrapper(scorer, idx); + subScorersPQ.add(disiWrapper); } return subScorersPQ; } diff --git a/src/main/java/org/opensearch/neuralsearch/search/HybridDisiWrapper.java b/src/main/java/org/opensearch/neuralsearch/search/HybridDisiWrapper.java new file mode 100644 index 000000000..19bc6565f --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/search/HybridDisiWrapper.java @@ -0,0 +1,23 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.search; + +import lombok.Getter; +import org.apache.lucene.search.DisiWrapper; +import org.apache.lucene.search.Scorer; + +/** + * Wrapper for DisiWrapper, saves state of sub-queries for performance reasons + */ +@Getter +public class HybridDisiWrapper extends DisiWrapper { + // index of disi wrapper sub-query object when its part of the hybrid query + int subQueryIndex = -1; + + public HybridDisiWrapper(Scorer scorer, int subQueryIndex) { + super(scorer); + this.subQueryIndex = subQueryIndex; + } +} diff --git a/src/test/java/org/opensearch/neuralsearch/search/HybridDisiWrapperTests.java b/src/test/java/org/opensearch/neuralsearch/search/HybridDisiWrapperTests.java new file mode 100644 index 000000000..cd6076290 --- /dev/null +++ b/src/test/java/org/opensearch/neuralsearch/search/HybridDisiWrapperTests.java @@ -0,0 +1,24 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.search; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Scorer; +import org.opensearch.neuralsearch.query.OpenSearchQueryTestCase; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class HybridDisiWrapperTests extends OpenSearchQueryTestCase { + + public void testSubQueryIndex_whenCreateNewInstanceAndSetIndex_thenSuccessful() { + Scorer scorer = mock(Scorer.class); + DocIdSetIterator docIdSetIterator = mock(DocIdSetIterator.class); + when(scorer.iterator()).thenReturn(docIdSetIterator); + int subQueryIndex = 2; + HybridDisiWrapper hybridDisiWrapper = new HybridDisiWrapper(scorer, subQueryIndex); + assertEquals(2, hybridDisiWrapper.getSubQueryIndex()); + } +}