-
Notifications
You must be signed in to change notification settings - Fork 674
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SOLR-12697 Add pure DocValues support to FieldValueFeature #123
Changes from 29 commits
2ee8779
bdce029
e6601ee
d6e1477
5bc995c
4559415
ec4cbfb
e6f20f1
f16ce3d
e5954eb
da6a635
b105627
e07c432
443a396
2dbd94e
9b77154
c1f3a8e
3c38e91
53cd2fb
e854f50
b9d3cd0
da57e9c
abb3632
a789b12
c42be54
83bc1ee
385d8b2
4348d04
ad489d0
2c3a368
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,16 +23,26 @@ | |
import java.util.Set; | ||
|
||
import org.apache.lucene.document.Document; | ||
import org.apache.lucene.index.DocValues; | ||
import org.apache.lucene.index.DocValuesType; | ||
import org.apache.lucene.index.FieldInfo; | ||
import org.apache.lucene.index.IndexableField; | ||
import org.apache.lucene.index.LeafReaderContext; | ||
import org.apache.lucene.index.NumericDocValues; | ||
import org.apache.lucene.index.SortedDocValues; | ||
import org.apache.lucene.search.DocIdSetIterator; | ||
import org.apache.lucene.search.IndexSearcher; | ||
import org.apache.lucene.search.Query; | ||
import org.apache.lucene.util.BytesRef; | ||
import org.apache.solr.request.SolrQueryRequest; | ||
import org.apache.solr.schema.BoolField; | ||
import org.apache.solr.schema.NumberType; | ||
import org.apache.solr.schema.SchemaField; | ||
import org.apache.solr.search.SolrIndexSearcher; | ||
|
||
/** | ||
* This feature returns the value of a field in the current document | ||
* This feature returns the value of a field in the current document. | ||
* The field must have stored="true" or docValues="true" properties. | ||
* Example configuration: | ||
* <pre>{ | ||
"name": "rawHits", | ||
|
@@ -41,6 +51,17 @@ | |
"field": "hits" | ||
} | ||
}</pre> | ||
* | ||
* <p>There are 4 different types of FeatureScorers that a FieldValueFeatureWeight may use. | ||
* The chosen scorer depends on the field attributes.</p> | ||
* | ||
* <p>FieldValueFeatureScorer (FVFS): used for stored=true, no matter if docValues=true or docValues=false</p> | ||
* | ||
* <p>NumericDocValuesFVFS: used for stored=false and docValues=true, if docValueType == NUMERIC</p> | ||
 * <p>SortedDocValuesFVFS: used for stored=false and docValues=true, if docValueType == SORTED</p> | ||
* | ||
* <p>DefaultValueFVFS: used for stored=false and docValues=true, a fallback scorer that is used on segments | ||
* where no document has a value set in the field of this feature</p> | ||
*/ | ||
public class FieldValueFeature extends Feature { | ||
|
||
|
@@ -83,18 +104,52 @@ public FeatureWeight createWeight(IndexSearcher searcher, boolean needsScores, | |
} | ||
|
||
public class FieldValueFeatureWeight extends FeatureWeight { | ||
private final SchemaField schemaField; | ||
|
||
public FieldValueFeatureWeight(IndexSearcher searcher, | ||
SolrQueryRequest request, Query originalQuery, Map<String,String[]> efi) { | ||
super(FieldValueFeature.this, searcher, request, originalQuery, efi); | ||
if (searcher instanceof SolrIndexSearcher) { | ||
schemaField = ((SolrIndexSearcher) searcher).getSchema().getFieldOrNull(field); | ||
} else { // some tests pass a null or a non-SolrIndexSearcher searcher | ||
schemaField = null; | ||
} | ||
} | ||
|
||
/** | ||
* Return a FeatureScorer that uses docValues or storedFields if no docValues are present | ||
* | ||
* @param context the segment this FeatureScorer is working with | ||
* @return FeatureScorer for the current segment and field | ||
* @throws IOException as defined by abstract class Feature | ||
*/ | ||
@Override | ||
public FeatureScorer scorer(LeafReaderContext context) throws IOException { | ||
if (schemaField != null && !schemaField.stored() && schemaField.hasDocValues()) { | ||
|
||
final FieldInfo fieldInfo = context.reader().getFieldInfos().fieldInfo(field); | ||
final DocValuesType docValuesType = fieldInfo != null ? fieldInfo.getDocValuesType() : DocValuesType.NONE; | ||
|
||
if (DocValuesType.NUMERIC.equals(docValuesType)) { | ||
return new NumericDocValuesFieldValueFeatureScorer(this, context, | ||
DocIdSetIterator.all(DocIdSetIterator.NO_MORE_DOCS), schemaField.getType().getNumberType()); | ||
} else if (DocValuesType.SORTED.equals(docValuesType)) { | ||
return new SortedDocValuesFieldValueFeatureScorer(this, context, | ||
DocIdSetIterator.all(DocIdSetIterator.NO_MORE_DOCS)); | ||
} else if (DocValuesType.NONE.equals(docValuesType)) { | ||
// Using a fallback feature scorer because this segment has no documents with a doc value for the current field | ||
return new DefaultValueFieldValueFeatureScorer(this, DocIdSetIterator.all(DocIdSetIterator.NO_MORE_DOCS)); | ||
} | ||
throw new IllegalArgumentException("Doc values type " + docValuesType.name() + " of field " + field | ||
+ " is not supported"); | ||
} | ||
return new FieldValueFeatureScorer(this, context, | ||
DocIdSetIterator.all(DocIdSetIterator.NO_MORE_DOCS)); | ||
} | ||
|
||
/** | ||
* A FeatureScorer that reads the stored value for a field | ||
*/ | ||
public class FieldValueFeatureScorer extends FeatureScorer { | ||
|
||
LeafReaderContext context = null; | ||
|
@@ -146,5 +201,137 @@ public float getMaxScore(int upTo) throws IOException { | |
return Float.POSITIVE_INFINITY; | ||
} | ||
} | ||
|
||
/** | ||
* A FeatureScorer that reads the numeric docValues for a field | ||
*/ | ||
public final class NumericDocValuesFieldValueFeatureScorer extends FeatureScorer { | ||
private final NumericDocValues docValues; | ||
private final NumberType numberType; | ||
|
||
public NumericDocValuesFieldValueFeatureScorer(final FeatureWeight weight, final LeafReaderContext context, | ||
final DocIdSetIterator itr, final NumberType numberType) { | ||
super(weight, itr); | ||
this.numberType = numberType; | ||
|
||
NumericDocValues docValues; | ||
try { | ||
docValues = DocValues.getNumeric(context.reader(), field); | ||
} catch (IOException e) { | ||
throw new IllegalArgumentException("Could not read numeric docValues for field " + field); | ||
} | ||
this.docValues = docValues; | ||
} | ||
|
||
@Override | ||
public float score() throws IOException { | ||
if (docValues.advanceExact(itr.docID())) { | ||
return readNumericDocValues(); | ||
} | ||
return FieldValueFeature.this.getDefaultValue(); | ||
} | ||
|
||
/** | ||
* Read the numeric value for a field and convert the different number types to float. | ||
* | ||
* @return The numeric value that the docValues contain for the current document | ||
* @throws IOException if docValues cannot be read | ||
*/ | ||
private float readNumericDocValues() throws IOException { | ||
if (NumberType.FLOAT.equals(numberType)) { | ||
// convert float value that was stored as long back to float | ||
return Float.intBitsToFloat((int) docValues.longValue()); | ||
} else if (NumberType.DOUBLE.equals(numberType)) { | ||
// handle double value conversion | ||
return (float) Double.longBitsToDouble(docValues.longValue()); | ||
} | ||
// just take the long value | ||
return docValues.longValue(); | ||
} | ||
Comment on lines
+240
to
+250
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 1/2 So I was trying to learn and understand better w.r.t. why the Would you have any thoughts on using a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Somewhat strangely so perhaps the use of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm... the date being a numeric type really is quite unintuitive. I will go and make a tea and then look into that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh... Thanks github for not refreshing the page... 🙄 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Likewise, I would have preferred for there to be some existing reusable method and was surprised that there isn't one (or not an obviously findable one). The |
||
|
||
@Override | ||
public float getMaxScore(int upTo) throws IOException { | ||
return Float.POSITIVE_INFINITY; | ||
} | ||
} | ||
|
||
/** | ||
* A FeatureScorer that reads the sorted docValues for a field | ||
*/ | ||
public final class SortedDocValuesFieldValueFeatureScorer extends FeatureScorer { | ||
private final SortedDocValues docValues; | ||
|
||
public SortedDocValuesFieldValueFeatureScorer(final FeatureWeight weight, final LeafReaderContext context, | ||
final DocIdSetIterator itr) { | ||
super(weight, itr); | ||
|
||
SortedDocValues docValues; | ||
try { | ||
docValues = DocValues.getSorted(context.reader(), field); | ||
} catch (IOException e) { | ||
throw new IllegalArgumentException("Could not read sorted docValues for field " + field); | ||
} | ||
this.docValues = docValues; | ||
} | ||
|
||
@Override | ||
public float score() throws IOException { | ||
if (docValues.advanceExact(itr.docID())) { | ||
int ord = docValues.ordValue(); | ||
return readSortedDocValues(docValues.lookupOrd(ord)); | ||
} | ||
return FieldValueFeature.this.getDefaultValue(); | ||
} | ||
|
||
/** | ||
* Interprets the bytesRef either as true / false token or tries to read it as number string | ||
* | ||
* @param bytesRef the value of the field that should be used as score | ||
* @return the input converted to a number | ||
*/ | ||
private float readSortedDocValues(BytesRef bytesRef) { | ||
String string = bytesRef.utf8ToString(); | ||
if (string.length() == 1) { | ||
// boolean values in the index are encoded with the | ||
// a single char contained in TRUE_TOKEN or FALSE_TOKEN | ||
// (see BoolField) | ||
if (string.charAt(0) == BoolField.TRUE_TOKEN[0]) { | ||
return 1; | ||
} | ||
if (string.charAt(0) == BoolField.FALSE_TOKEN[0]) { | ||
return 0; | ||
} | ||
} | ||
return FieldValueFeature.this.getDefaultValue(); | ||
} | ||
|
||
@Override | ||
public float getMaxScore(int upTo) throws IOException { | ||
return Float.POSITIVE_INFINITY; | ||
} | ||
} | ||
|
||
/** | ||
* A FeatureScorer that always returns the default value. | ||
* | ||
* It is used as a fallback for cases when a segment does not have any documents that contain doc values for a field. | ||
* By doing so, we prevent a fallback to the FieldValueFeatureScorer, which would also return the default value but | ||
* in a less performant way because it would first try to read the stored fields for the doc (which aren't present). | ||
*/ | ||
public final class DefaultValueFieldValueFeatureScorer extends FeatureScorer { | ||
public DefaultValueFieldValueFeatureScorer(final FeatureWeight weight, final DocIdSetIterator itr) { | ||
super(weight, itr); | ||
} | ||
|
||
@Override | ||
public float score() throws IOException { | ||
return FieldValueFeature.this.getDefaultValue(); | ||
} | ||
|
||
@Override | ||
public float getMaxScore(int upTo) throws IOException { | ||
return Float.POSITIVE_INFINITY; | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,8 +26,28 @@ | |
<field name="description" type="text_general" indexed="true" stored="true"/> | ||
<field name="keywords" type="text_general" indexed="true" stored="true" multiValued="true"/> | ||
<field name="popularity" type="int" indexed="true" stored="true" /> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. observation: via the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added a commit with additional |
||
<field name="dvIntPopularity" type="int" indexed="false" docValues="true" stored="false" multiValued="false"/> | ||
<field name="dvLongPopularity" type="long" indexed="false" docValues="true" stored="false" multiValued="false"/> | ||
<field name="dvFloatPopularity" type="float" indexed="false" docValues="true" stored="false" multiValued="false"/> | ||
<field name="dvDoublePopularity" type="double" indexed="false" docValues="true" stored="false" multiValued="false"/> | ||
<field name="dvStringPopularity" type="string" indexed="false" docValues="true" stored="false" multiValued="false"/> | ||
<field name="dvStringPopularities" type="string" indexed="false" docValues="true" stored="false" multiValued="true"/> | ||
<field name="normHits" type="float" indexed="true" stored="true" /> | ||
|
||
<field name="isTrendy" type="boolean" indexed="true" stored="true" /> | ||
<field name="dvIsTrendy" type="boolean" indexed="true" stored="false" docValues="true"/> | ||
|
||
<field name="dvIntField" type="int" indexed="false" docValues="true" stored="false" default="-1" multiValued="false"/> | ||
<field name="dvLongField" type="long" indexed="false" docValues="true" stored="false" default="-2" multiValued="false"/> | ||
<field name="dvFloatField" type="float" indexed="false" docValues="true" stored="false" default="-3" multiValued="false"/> | ||
<field name="dvDoubleField" type="double" indexed="false" docValues="true" stored="false" multiValued="false"/> | ||
<field name="dvStrNumField" type="string" indexed="false" docValues="true" stored="false" multiValued="false"/> | ||
<field name="dvStrBoolField" type="boolean" indexed="false" docValues="true" stored="false" multiValued="false"/> | ||
<field name="dvDateField" type="date" indexed="false" docValues="true" stored="false" multiValued="false"/> | ||
|
||
<field name="noDvFloatField" type="float" indexed="false" docValues="false" stored="true" multiValued="false"/> | ||
<field name="noDvStrNumField" type="string" indexed="false" docValues="false" stored="true" multiValued="false"/> | ||
<field name="noDvDateField" type="date" indexed="false" docValues="false" stored="true" multiValued="false"/> | ||
|
||
<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/> | ||
<field name="_version_" type="long" indexed="true" stored="true"/> | ||
|
@@ -41,6 +61,13 @@ | |
<copyField source="title" dest="text"/> | ||
<copyField source="description" dest="text"/> | ||
|
||
<copyField source="popularity" dest="dvIntPopularity"/> | ||
<copyField source="popularity" dest="dvLongPopularity"/> | ||
<copyField source="popularity" dest="dvFloatPopularity"/> | ||
<copyField source="popularity" dest="dvDoublePopularity"/> | ||
Comment on lines
+64
to
+67
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. observation: |
||
|
||
<copyField source="isTrendy" dest="dvIsTrendy"/> | ||
|
||
<types> | ||
<fieldType name="string" class="solr.StrField" sortMissingLast="true" /> | ||
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
2/2 ... combined with here diverting any unexpected `getNumberType()` enums onto the "is not supported" code path? Although having said that, I don't know what the existing `FieldValueFeatureScorer` would return for a stored `DATE` field, i.e. perhaps back compat considerations apply, hmm.