forked from opensearch-project/neural-search
Commit
[Part 3] Concurrent segment search bug in Sorting (opensearch-project#808)

* Cherry picking Concurrent Segment Search Bug Commit
* Fix Concurrent Segment Search Bug in Sorting
* Functional Interface
* Addressing Martin Comments
* Removing comments
* Addressing Martin Comments
* Addressing Martin Comments
* Addressing Martin Comments
* Address Martin Comments
* Address Martin Comments

Signed-off-by: Varun Jain <[email protected]>
Co-authored-by: Martin Gaievski <[email protected]>
1 parent d0f870c · commit ded2788
Showing 22 changed files with 1,837 additions and 206 deletions.
29 changes: 29 additions & 0 deletions
src/main/java/org/opensearch/neuralsearch/search/collector/HybridSearchCollector.java
@@ -0,0 +1,29 @@

```java
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */
package org.opensearch.neuralsearch.search.collector;

import java.util.List;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.TopDocs;

/**
 * Common interface class for Hybrid search collectors
 */
public interface HybridSearchCollector extends Collector {
    /**
     * @return List of topDocs which contains topDocs of individual subqueries.
     */
    List<? extends TopDocs> topDocs();

    /**
     * @return count of total hits per shard
     */
    int getTotalHits();

    /**
     * @return maxScore found on a shard
     */
    float getMaxScore();
}
```
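For orientation, here is a minimal sketch of how a collector manager could consume this interface once collection finishes. The `HybridCollectorReduceSketch` class, its `reduce` method, and the `collectors` list are assumptions made up for illustration; none of this is code from the commit.

```java
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.search.TopDocs;
import org.opensearch.neuralsearch.search.collector.HybridSearchCollector;

final class HybridCollectorReduceSketch {
    // Hypothetical shard-level reduce over collectors implementing HybridSearchCollector.
    static void reduce(List<HybridSearchCollector> collectors) {
        float maxScore = 0.0f;
        int totalHits = 0;
        List<TopDocs> perSubQueryDocs = new ArrayList<>();
        for (HybridSearchCollector collector : collectors) {
            maxScore = Math.max(maxScore, collector.getMaxScore()); // best score seen on the shard
            totalHits += collector.getTotalHits();                  // hit counts are additive per shard
            perSubQueryDocs.addAll(collector.topDocs());            // keep per-sub-query TopDocs for later merging
        }
        System.out.printf("hits=%d, maxScore=%.3f, topDocs lists=%d%n", totalHits, maxScore, perSubQueryDocs.size());
    }
}
```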
161 changes: 95 additions & 66 deletions
src/main/java/org/opensearch/neuralsearch/search/query/HybridCollectorManager.java
Large diffs are not rendered by default.
57 changes: 57 additions & 0 deletions
src/main/java/org/opensearch/neuralsearch/search/query/HybridQueryFieldDocComparator.java
@@ -0,0 +1,57 @@

```java
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */
package org.opensearch.neuralsearch.search.query;

import java.util.Comparator;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.Pruning;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SortField;

/**
 * Comparator class that compares two field docs as per the sorting criteria
 */
@RequiredArgsConstructor(access = AccessLevel.PACKAGE)
class HybridQueryFieldDocComparator implements Comparator<FieldDoc> {
    final SortField[] sortFields;
    final FieldComparator<?>[] comparators;
    final int[] reverseMul;
    final Comparator<ScoreDoc> tieBreaker;

    public HybridQueryFieldDocComparator(SortField[] sortFields, Comparator<ScoreDoc> tieBreaker) {
        this.sortFields = sortFields;
        this.tieBreaker = tieBreaker;
        comparators = new FieldComparator[sortFields.length];
        reverseMul = new int[sortFields.length];
        for (int compIDX = 0; compIDX < sortFields.length; compIDX++) {
            final SortField sortField = sortFields[compIDX];
            comparators[compIDX] = sortField.getComparator(1, Pruning.NONE);
            reverseMul[compIDX] = sortField.getReverse() ? -1 : 1;
        }
    }

    @Override
    public int compare(final FieldDoc firstFD, final FieldDoc secondFD) {
        for (int compIDX = 0; compIDX < comparators.length; compIDX++) {
            final FieldComparator comp = comparators[compIDX];

            final int cmp = reverseMul[compIDX] * comp.compareValues(firstFD.fields[compIDX], secondFD.fields[compIDX]);

            if (cmp != 0) {
                return cmp;
            }
        }
        return tieBreakCompare(firstFD, secondFD, tieBreaker);
    }

    private int tieBreakCompare(ScoreDoc firstDoc, ScoreDoc secondDoc, Comparator<ScoreDoc> tieBreaker) {
        assert tieBreaker != null;
        int value = tieBreaker.compare(firstDoc, secondDoc);
        return value;
    }
}
```
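As a quick illustration of the contract, here is a hedged sketch that exercises the comparator on two FieldDoc instances. The sort field name, the field values, and the doc-id tie-breaker are assumptions chosen for the example; the sketch sits in the same package only because the class is package-private.

```java
package org.opensearch.neuralsearch.search.query;

import java.util.Comparator;

import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SortField;

final class FieldDocComparatorSketch {
    static void demo() {
        // Hypothetical single-field descending sort on a long value (a made-up "stock" field).
        SortField[] sortFields = new SortField[] { new SortField("stock", SortField.Type.LONG, true) };
        // Tie-break by doc id, mirroring the merging tie-breaker used in TopDocsMerger.
        Comparator<ScoreDoc> tieBreaker = Comparator.comparingInt(doc -> doc.doc);
        HybridQueryFieldDocComparator comparator = new HybridQueryFieldDocComparator(sortFields, tieBreaker);

        FieldDoc first = new FieldDoc(0, Float.NaN, new Object[] { 100L });
        FieldDoc second = new FieldDoc(1, Float.NaN, new Object[] { 42L });

        // Negative result: `first` sorts before `second` because 100 > 42 on a descending sort.
        System.out.println(comparator.compare(first, second));
    }
}
```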
103 changes: 103 additions & 0 deletions
src/main/java/org/opensearch/neuralsearch/search/query/HybridQueryScoreDocsMerger.java
@@ -0,0 +1,103 @@

```java
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */
package org.opensearch.neuralsearch.search.query;

import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.ScoreDoc;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;

import static org.opensearch.neuralsearch.search.util.HybridSearchResultFormatUtil.isHybridQueryScoreDocElement;

/**
 * Merges two ScoreDoc arrays into one
 */
@NoArgsConstructor(access = AccessLevel.PACKAGE)
class HybridQueryScoreDocsMerger<T extends ScoreDoc> {

    private static final int MIN_NUMBER_OF_ELEMENTS_IN_SCORE_DOC = 3;

    /**
     * Merge two score docs objects, result ScoreDocs[] object will have all hits per sub-query from both original objects.
     * Input and output ScoreDocs are in a format that is specific to Hybrid Query. This method should not be used for ScoreDocs from
     * other query types.
     * Logic is based on the assumption that hits of every sub-query are sorted by score.
     * Method returns a new object and doesn't mutate original ScoreDocs arrays.
     * @param sourceScoreDocs original score docs from query result
     * @param newScoreDocs new score docs that we need to merge into existing scores
     * @param comparator comparator to compare the score docs
     * @param isSortEnabled flag that shows if sort is enabled or disabled
     * @return merged array of ScoreDocs objects
     */
    public T[] merge(final T[] sourceScoreDocs, final T[] newScoreDocs, final Comparator<T> comparator, final boolean isSortEnabled) {
        if (Objects.requireNonNull(sourceScoreDocs, "score docs cannot be null").length < MIN_NUMBER_OF_ELEMENTS_IN_SCORE_DOC
            || Objects.requireNonNull(newScoreDocs, "score docs cannot be null").length < MIN_NUMBER_OF_ELEMENTS_IN_SCORE_DOC) {
            throw new IllegalArgumentException("cannot merge top docs because it does not have enough elements");
        }
        // we overshoot and preallocate more than we need - length of both top docs combined.
        // we will take only a portion of the array at the end
        List<T> mergedScoreDocs = new ArrayList<>(sourceScoreDocs.length + newScoreDocs.length);
        int sourcePointer = 0;
        // mark beginning of hybrid query results by start element
        mergedScoreDocs.add(sourceScoreDocs[sourcePointer]);
        sourcePointer++;
        // new pointer is set to 1 as we don't care about its start-stop element
        int newPointer = 1;

        while (sourcePointer < sourceScoreDocs.length - 1 && newPointer < newScoreDocs.length - 1) {
            // every iteration is for results of one sub-query
            mergedScoreDocs.add(sourceScoreDocs[sourcePointer]);
            sourcePointer++;
            newPointer++;
            // simplest case when both arrays have results for sub-query
            while (sourcePointer < sourceScoreDocs.length
                && isHybridQueryScoreDocElement(sourceScoreDocs[sourcePointer])
                && newPointer < newScoreDocs.length
                && isHybridQueryScoreDocElement(newScoreDocs[newPointer])) {
                if (compareCondition(sourceScoreDocs[sourcePointer], newScoreDocs[newPointer], comparator, isSortEnabled)) {
                    mergedScoreDocs.add(sourceScoreDocs[sourcePointer]);
                    sourcePointer++;
                } else {
                    mergedScoreDocs.add(newScoreDocs[newPointer]);
                    newPointer++;
                }
            }
            // at least one object got exhausted at this point, now merge all elements from the object that's left
            while (sourcePointer < sourceScoreDocs.length && isHybridQueryScoreDocElement(sourceScoreDocs[sourcePointer])) {
                mergedScoreDocs.add(sourceScoreDocs[sourcePointer]);
                sourcePointer++;
            }
            while (newPointer < newScoreDocs.length && isHybridQueryScoreDocElement(newScoreDocs[newPointer])) {
                mergedScoreDocs.add(newScoreDocs[newPointer]);
                newPointer++;
            }
        }
        // mark end of hybrid query results by end element
        mergedScoreDocs.add(sourceScoreDocs[sourceScoreDocs.length - 1]);
        if (isSortEnabled) {
            return mergedScoreDocs.toArray((T[]) new FieldDoc[0]);
        }
        return mergedScoreDocs.toArray((T[]) new ScoreDoc[0]);
    }

    private boolean compareCondition(
        final ScoreDoc oldScoreDoc,
        final ScoreDoc secondScoreDoc,
        final Comparator<T> comparator,
        final boolean isSortEnabled
    ) {
        // If sorting is enabled then the compare condition is different than for a normal HybridQuery
        if (isSortEnabled) {
            return comparator.compare((T) oldScoreDoc, (T) secondScoreDoc) < 0;
        } else {
            return comparator.compare((T) oldScoreDoc, (T) secondScoreDoc) >= 0;
        }
    }
}
```
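To make the expected hybrid layout concrete, here is a hedged sketch of a score-based merge (sort disabled). The `startStop` and `delimiter` parameters stand in for the special marker elements that HybridSearchResultFormatUtil is expected to produce; every doc id and score below is invented for illustration, and the sketch lives in the same package because the merger class is package-private.

```java
package org.opensearch.neuralsearch.search.query;

import java.util.Comparator;

import org.apache.lucene.search.ScoreDoc;

final class ScoreDocsMergerSketch {
    // startStop and delimiter must be the real marker elements from HybridSearchResultFormatUtil;
    // they are passed in so this sketch stays independent of those helpers.
    static ScoreDoc[] demo(ScoreDoc startStop, ScoreDoc delimiter) {
        // Hybrid layout: start | delimiter | sub-query 1 hits | delimiter | sub-query 2 hits | stop
        ScoreDoc[] source = new ScoreDoc[] {
            startStop,
            delimiter, new ScoreDoc(3, 0.9f), new ScoreDoc(7, 0.4f),
            delimiter, new ScoreDoc(1, 0.7f),
            startStop };
        ScoreDoc[] newDocs = new ScoreDoc[] {
            startStop,
            delimiter, new ScoreDoc(5, 0.6f),
            delimiter, new ScoreDoc(8, 0.8f),
            startStop };

        Comparator<ScoreDoc> byScore = Comparator.comparing(scoreDoc -> scoreDoc.score);
        HybridQueryScoreDocsMerger<ScoreDoc> merger = new HybridQueryScoreDocsMerger<>();
        // Score-based merge (sort disabled): each sub-query section stays ordered by descending score.
        return merger.merge(source, newDocs, byScore, false);
    }
}
```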
121 changes: 121 additions & 0 deletions
src/main/java/org/opensearch/neuralsearch/search/query/TopDocsMerger.java
@@ -0,0 +1,121 @@

```java
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */
package org.opensearch.neuralsearch.search.query;

import com.google.common.annotations.VisibleForTesting;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.TotalHits;
import org.opensearch.common.lucene.search.TopDocsAndMaxScore;

import java.util.Comparator;
import java.util.Objects;
import org.opensearch.search.sort.SortAndFormats;

/**
 * Utility class for merging TopDocs and MaxScore across multiple search queries
 */
@RequiredArgsConstructor(access = AccessLevel.PACKAGE)
class TopDocsMerger {

    private HybridQueryScoreDocsMerger docsMerger;
    private SortAndFormats sortAndFormats;
    @VisibleForTesting
    protected static Comparator<ScoreDoc> SCORE_DOC_BY_SCORE_COMPARATOR;
    @VisibleForTesting
    protected static HybridQueryFieldDocComparator FIELD_DOC_BY_SORT_CRITERIA_COMPARATOR;
    private final Comparator<ScoreDoc> MERGING_TIE_BREAKER = (o1, o2) -> {
        int docIdComparison = Integer.compare(o1.doc, o2.doc);
        return docIdComparison;
    };

    /**
     * Uses hybrid query score docs merger to merge internal score docs
     */
    TopDocsMerger(final SortAndFormats sortAndFormats) {
        this.sortAndFormats = sortAndFormats;
        if (isSortingEnabled()) {
            docsMerger = new HybridQueryScoreDocsMerger<FieldDoc>();
            FIELD_DOC_BY_SORT_CRITERIA_COMPARATOR = new HybridQueryFieldDocComparator(sortAndFormats.sort.getSort(), MERGING_TIE_BREAKER);
        } else {
            docsMerger = new HybridQueryScoreDocsMerger<>();
            SCORE_DOC_BY_SCORE_COMPARATOR = Comparator.comparing((scoreDoc) -> scoreDoc.score);
        }
    }

    /**
     * Merge TopDocs and MaxScore from multiple search queries into a single TopDocsAndMaxScore object.
     * @param source TopDocsAndMaxScore for the original query
     * @param newTopDocs TopDocsAndMaxScore for the new query
     * @return merged TopDocsAndMaxScore object
     */
    public TopDocsAndMaxScore merge(final TopDocsAndMaxScore source, final TopDocsAndMaxScore newTopDocs) {
        if (Objects.isNull(newTopDocs) || Objects.isNull(newTopDocs.topDocs) || newTopDocs.topDocs.totalHits.value == 0) {
            return source;
        }
        TotalHits mergedTotalHits = getMergedTotalHits(source, newTopDocs);
        TopDocsAndMaxScore result = new TopDocsAndMaxScore(
            getTopDocs(getMergedScoreDocs(source.topDocs.scoreDocs, newTopDocs.topDocs.scoreDocs), mergedTotalHits),
            Math.max(source.maxScore, newTopDocs.maxScore)
        );
        return result;
    }

    private TotalHits getMergedTotalHits(final TopDocsAndMaxScore source, final TopDocsAndMaxScore newTopDocs) {
        // merged value is a lower bound - if both are equal_to then merged will also be equal_to,
        // otherwise assign greater_than_or_equal
        TotalHits.Relation mergedHitsRelation = source.topDocs.totalHits.relation == TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO
            || newTopDocs.topDocs.totalHits.relation == TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO
                ? TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO
                : TotalHits.Relation.EQUAL_TO;
        return new TotalHits(source.topDocs.totalHits.value + newTopDocs.topDocs.totalHits.value, mergedHitsRelation);
    }

    private TopDocs getTopDocs(ScoreDoc[] mergedScoreDocs, TotalHits mergedTotalHits) {
        if (isSortingEnabled()) {
            return new TopFieldDocs(mergedTotalHits, mergedScoreDocs, sortAndFormats.sort.getSort());
        }
        return new TopDocs(mergedTotalHits, mergedScoreDocs);
    }

    private ScoreDoc[] getMergedScoreDocs(ScoreDoc[] source, ScoreDoc[] newScoreDocs) {
        // Case 1: when sorting is enabled, the TopDocs have the format below
        // and we need to merge hits per individual sub-query;
        // the layout of results in both new and source TopDocs is the following:
        // doc_id | magic_number_1 | [1]
        // doc_id | magic_number_2 | [1]
        // ...
        // doc_id | magic_number_2 | [1]
        // ...
        // doc_id | magic_number_2 | [1]
        // ...
        // doc_id | magic_number_1 | [1]

        // Case 2: when sorting is disabled, the TopDocs have the format below
        // and we need to merge hits per individual sub-query;
        // the layout of results in both new and source TopDocs is the following:
        // doc_id | magic_number_1
        // doc_id | magic_number_2
        // ...
        // doc_id | magic_number_2
        // ...
        // doc_id | magic_number_2
        // ...
        // doc_id | magic_number_1
        return docsMerger.merge(source, newScoreDocs, comparator(), isSortingEnabled());
    }

    private Comparator<? extends ScoreDoc> comparator() {
        return sortAndFormats != null ? FIELD_DOC_BY_SORT_CRITERIA_COMPARATOR : SCORE_DOC_BY_SCORE_COMPARATOR;
    }

    private boolean isSortingEnabled() {
        return sortAndFormats != null;
    }
}
```
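For context, a hedged sketch of how slice-level results might be folded together with this merger during concurrent segment search. It assumes the incoming ScoreDoc arrays are already in the hybrid start/delimiter/stop format described above; the class name, hit counts, and scores are illustrative, and it sits in the same package because the merger class is package-private.

```java
package org.opensearch.neuralsearch.search.query;

import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
import org.opensearch.common.lucene.search.TopDocsAndMaxScore;

final class TopDocsMergerSketch {
    // sliceOneDocs and sliceTwoDocs are assumed to be hybrid-format arrays produced elsewhere.
    static TopDocsAndMaxScore reduce(ScoreDoc[] sliceOneDocs, ScoreDoc[] sliceTwoDocs) {
        // Passing null SortAndFormats selects the score-based comparator path.
        TopDocsMerger merger = new TopDocsMerger(null);

        TopDocsAndMaxScore first = new TopDocsAndMaxScore(
            new TopDocs(new TotalHits(3, TotalHits.Relation.EQUAL_TO), sliceOneDocs),
            0.9f
        );
        TopDocsAndMaxScore second = new TopDocsAndMaxScore(
            new TopDocs(new TotalHits(2, TotalHits.Relation.EQUAL_TO), sliceTwoDocs),
            0.8f
        );

        // Hit counts add up, the relation stays EQUAL_TO, and maxScore is the larger of the two.
        return merger.merge(first, second);
    }
}
```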