From 2c63f8a38f4070b9625df5200ba268dfb8c28277 Mon Sep 17 00:00:00 2001 From: Martin Gaievski Date: Fri, 26 Apr 2024 12:29:29 -0700 Subject: [PATCH 1/3] Removed map of subquery to subquery index in favor of storing index as part of disi wrapper to improve hybrid query latencies by 20% Signed-off-by: Martin Gaievski --- CHANGELOG.md | 1 + .../neuralsearch/query/HybridQueryScorer.java | 65 +++---------------- .../search/HybridDisiWrapper.java | 23 +++++++ .../search/HybridDisiWrapperTests.java | 24 +++++++ 4 files changed, 57 insertions(+), 56 deletions(-) create mode 100644 src/main/java/org/opensearch/neuralsearch/search/HybridDisiWrapper.java create mode 100644 src/test/java/org/opensearch/neuralsearch/search/HybridDisiWrapperTests.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 683fda670..2fea42b6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Allowing execution of hybrid query on index alias with filters ([#670](https://github.com/opensearch-project/neural-search/pull/670)) - Allowing query by raw tokens in neural_sparse query ([#693](https://github.com/opensearch-project/neural-search/pull/693)) - Removed stream.findFirst implementation to use more native iteration implement to improve hybrid query latencies by 35% ([#706](https://github.com/opensearch-project/neural-search/pull/706)) +- Removed map of subquery to subquery index in favor of storing index as part of disi wrapper to improve hybrid query latencies by 20% ([#711](https://github.com/opensearch-project/neural-search/pull/711)) ### Bug Fixes - Add support for request_cache flag in hybrid query ([#663](https://github.com/opensearch-project/neural-search/pull/663)) ### Infrastructure diff --git a/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java b/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java index 1da610f53..32029d8c5 100644 --- a/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java +++ b/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java @@ -8,18 +8,13 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.HashMap; import java.util.List; -import java.util.Locale; -import java.util.Map; import java.util.Objects; -import com.google.common.primitives.Ints; import org.apache.lucene.search.DisiPriorityQueue; import org.apache.lucene.search.DisiWrapper; import org.apache.lucene.search.DisjunctionDISIApproximation; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TwoPhaseIterator; @@ -27,6 +22,7 @@ import lombok.Getter; import org.apache.lucene.util.PriorityQueue; +import org.opensearch.neuralsearch.search.HybridDisiWrapper; /** * Class abstracts functionality of Scorer for hybrid query. When iterating over documents in increasing @@ -43,8 +39,6 @@ public final class HybridQueryScorer extends Scorer { private final float[] subScores; - private final Map queryToIndex; - private final DocIdSetIterator approximation; private final HybridScoreBlockBoundaryPropagator disjunctionBlockPropagator; private final TwoPhase twoPhase; @@ -57,7 +51,6 @@ public HybridQueryScorer(final Weight weight, final List subScorers) thr super(weight); this.subScorers = Collections.unmodifiableList(subScorers); subScores = new float[subScorers.size()]; - this.queryToIndex = mapQueryToIndex(); this.subScorersPQ = initializeSubScorersPQ(); boolean needsScores = scoreMode != ScoreMode.COMPLETE_NO_SCORES; @@ -194,70 +187,30 @@ public int docID() { */ public float[] hybridScores() throws IOException { float[] scores = new float[subScores.length]; - DisiWrapper topList = subScorersPQ.topList(); - for (DisiWrapper disiWrapper = topList; disiWrapper != null; disiWrapper = disiWrapper.next) { + HybridDisiWrapper topList = (HybridDisiWrapper) subScorersPQ.topList(); + for (HybridDisiWrapper disiWrapper = topList; disiWrapper != null; disiWrapper = (HybridDisiWrapper) disiWrapper.next) { // check if this doc has match in the subQuery. If not, add score as 0.0 and continue Scorer scorer = disiWrapper.scorer; if (scorer.docID() == DocIdSetIterator.NO_MORE_DOCS) { continue; } - Query query = scorer.getWeight().getQuery(); - int[] indexes = queryToIndex.get(query); - // we need to find the index of first sub-query that hasn't been set yet. Such score will have initial value of "0.0" - int index = -1; - for (int idx : indexes) { - if (Float.compare(scores[idx], 0.0f) == 0) { - index = idx; - break; - } - } - if (index == -1) { - throw new IllegalStateException( - String.format( - Locale.ROOT, - "cannot set score for one of hybrid search subquery [%s] and document [%d]", - query.toString(), - scorer.docID() - ) - ); - } - scores[index] = scorer.score(); + scores[disiWrapper.getSubQueryIndex()] = scorer.score(); } return scores; } - private Map mapQueryToIndex() { - // we need list as number of identical queries is unknown - Map> queryToListOfIndexes = new HashMap<>(); - int idx = 0; - for (Scorer scorer : subScorers) { - if (scorer == null) { - idx++; - continue; - } - Query query = scorer.getWeight().getQuery(); - queryToListOfIndexes.putIfAbsent(query, new ArrayList<>()); - queryToListOfIndexes.get(query).add(idx); - idx++; - } - // convert to the int array for better performance - Map queryToIndex = new HashMap<>(); - queryToListOfIndexes.forEach((key, value) -> queryToIndex.put(key, Ints.toArray(value))); - return queryToIndex; - } - private DisiPriorityQueue initializeSubScorersPQ() { - Objects.requireNonNull(queryToIndex, "should not be null"); Objects.requireNonNull(subScorers, "should not be null"); // we need to count this way in order to include all identical sub-queries - int numOfSubQueries = queryToIndex.values().stream().map(array -> array.length).reduce(0, Integer::sum); + int numOfSubQueries = subScorers.size(); DisiPriorityQueue subScorersPQ = new DisiPriorityQueue(numOfSubQueries); - for (Scorer scorer : subScorers) { + for (int idx = 0; idx < subScorers.size(); idx++) { + Scorer scorer = subScorers.get(idx); if (scorer == null) { continue; } - final DisiWrapper w = new DisiWrapper(scorer); - subScorersPQ.add(w); + final HybridDisiWrapper disiWrapper = new HybridDisiWrapper(scorer, idx); + subScorersPQ.add(disiWrapper); } return subScorersPQ; } diff --git a/src/main/java/org/opensearch/neuralsearch/search/HybridDisiWrapper.java b/src/main/java/org/opensearch/neuralsearch/search/HybridDisiWrapper.java new file mode 100644 index 000000000..19bc6565f --- /dev/null +++ b/src/main/java/org/opensearch/neuralsearch/search/HybridDisiWrapper.java @@ -0,0 +1,23 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.search; + +import lombok.Getter; +import org.apache.lucene.search.DisiWrapper; +import org.apache.lucene.search.Scorer; + +/** + * Wrapper for DisiWrapper, saves state of sub-queries for performance reasons + */ +@Getter +public class HybridDisiWrapper extends DisiWrapper { + // index of disi wrapper sub-query object when its part of the hybrid query + int subQueryIndex = -1; + + public HybridDisiWrapper(Scorer scorer, int subQueryIndex) { + super(scorer); + this.subQueryIndex = subQueryIndex; + } +} diff --git a/src/test/java/org/opensearch/neuralsearch/search/HybridDisiWrapperTests.java b/src/test/java/org/opensearch/neuralsearch/search/HybridDisiWrapperTests.java new file mode 100644 index 000000000..cd6076290 --- /dev/null +++ b/src/test/java/org/opensearch/neuralsearch/search/HybridDisiWrapperTests.java @@ -0,0 +1,24 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.neuralsearch.search; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Scorer; +import org.opensearch.neuralsearch.query.OpenSearchQueryTestCase; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class HybridDisiWrapperTests extends OpenSearchQueryTestCase { + + public void testSubQueryIndex_whenCreateNewInstanceAndSetIndex_thenSuccessful() { + Scorer scorer = mock(Scorer.class); + DocIdSetIterator docIdSetIterator = mock(DocIdSetIterator.class); + when(scorer.iterator()).thenReturn(docIdSetIterator); + int subQueryIndex = 2; + HybridDisiWrapper hybridDisiWrapper = new HybridDisiWrapper(scorer, subQueryIndex); + assertEquals(2, hybridDisiWrapper.getSubQueryIndex()); + } +} From dec9a061c08809f432edec12fcc1176b4ff85918 Mon Sep 17 00:00:00 2001 From: Martin Gaievski Date: Fri, 26 Apr 2024 15:18:00 -0700 Subject: [PATCH 2/3] Adding type check before cast to HybridDISIWrapper Signed-off-by: Martin Gaievski --- .../neuralsearch/query/HybridQueryScorer.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java b/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java index 32029d8c5..657046151 100644 --- a/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java +++ b/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java @@ -9,8 +9,10 @@ import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.Locale; import java.util.Objects; +import lombok.extern.log4j.Log4j2; import org.apache.lucene.search.DisiPriorityQueue; import org.apache.lucene.search.DisiWrapper; import org.apache.lucene.search.DisjunctionDISIApproximation; @@ -29,6 +31,7 @@ * order of doc id, this class fills up array of scores per sub-query for each doc id. Order in array of scores * corresponds to order of sub-queries in an input Hybrid query. */ +@Log4j2 public final class HybridQueryScorer extends Scorer { // score for each of sub-query in this hybrid query @@ -187,6 +190,17 @@ public int docID() { */ public float[] hybridScores() throws IOException { float[] scores = new float[subScores.length]; + if (subScorersPQ.topList() instanceof HybridDisiWrapper == false) { + log.error( + String.format( + Locale.ROOT, + "Unexpected type of DISI wrapper, expected [%s] but found [%s]", + HybridDisiWrapper.class.getSimpleName(), + subScorersPQ.topList().getClass().getSimpleName() + ) + ); + throw new IllegalStateException(); + } HybridDisiWrapper topList = (HybridDisiWrapper) subScorersPQ.topList(); for (HybridDisiWrapper disiWrapper = topList; disiWrapper != null; disiWrapper = (HybridDisiWrapper) disiWrapper.next) { // check if this doc has match in the subQuery. If not, add score as 0.0 and continue From cc8998927d0e99fb6bb9dda43b94632870f96a45 Mon Sep 17 00:00:00 2001 From: Martin Gaievski Date: Fri, 26 Apr 2024 15:53:02 -0700 Subject: [PATCH 3/3] Making subquery index private var final Signed-off-by: Martin Gaievski --- .../neuralsearch/query/HybridQueryScorer.java | 11 +++++++---- .../neuralsearch/search/HybridDisiWrapper.java | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java b/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java index 657046151..5afd43917 100644 --- a/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java +++ b/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java @@ -190,7 +190,8 @@ public int docID() { */ public float[] hybridScores() throws IOException { float[] scores = new float[subScores.length]; - if (subScorersPQ.topList() instanceof HybridDisiWrapper == false) { + DisiWrapper topList = subScorersPQ.topList(); + if (topList instanceof HybridDisiWrapper == false) { log.error( String.format( Locale.ROOT, @@ -199,10 +200,12 @@ public float[] hybridScores() throws IOException { subScorersPQ.topList().getClass().getSimpleName() ) ); - throw new IllegalStateException(); + throw new IllegalStateException( + "Unable to collect scores for one of the sub-queries, encountered an unexpected type of score iterator." + ); } - HybridDisiWrapper topList = (HybridDisiWrapper) subScorersPQ.topList(); - for (HybridDisiWrapper disiWrapper = topList; disiWrapper != null; disiWrapper = (HybridDisiWrapper) disiWrapper.next) { + for (HybridDisiWrapper disiWrapper = (HybridDisiWrapper) topList; disiWrapper != null; disiWrapper = + (HybridDisiWrapper) disiWrapper.next) { // check if this doc has match in the subQuery. If not, add score as 0.0 and continue Scorer scorer = disiWrapper.scorer; if (scorer.docID() == DocIdSetIterator.NO_MORE_DOCS) { diff --git a/src/main/java/org/opensearch/neuralsearch/search/HybridDisiWrapper.java b/src/main/java/org/opensearch/neuralsearch/search/HybridDisiWrapper.java index 19bc6565f..7165ce055 100644 --- a/src/main/java/org/opensearch/neuralsearch/search/HybridDisiWrapper.java +++ b/src/main/java/org/opensearch/neuralsearch/search/HybridDisiWrapper.java @@ -14,7 +14,7 @@ @Getter public class HybridDisiWrapper extends DisiWrapper { // index of disi wrapper sub-query object when its part of the hybrid query - int subQueryIndex = -1; + private final int subQueryIndex; public HybridDisiWrapper(Scorer scorer, int subQueryIndex) { super(scorer);