From 0427339ab04912d51c8bb53cc3a9dbda317f3dd2 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 4 Jun 2018 08:50:35 +0100 Subject: [PATCH] Index phrases (#30450) Specifying `index_phrases: true` on a text field mapping will add a subsidiary [field]._index_phrase field, indexing two-term shingles from the parent field. The parent analysis chain is re-used, wrapped with a FixedShingleFilter. At query time, if a phrase match query is executed, the mapping will redirect it to run against the subsidiary field. This should trade faster phrase querying for a larger index and longer indexing times. Relates to #27049 --- docs/reference/mapping/types/text.asciidoc | 8 + .../test/search/200_index_phrase_search.yml | 67 +++++ .../index/mapper/MappedFieldType.java | 10 + .../index/mapper/TextFieldMapper.java | 235 +++++++++++++++++- .../index/query/MatchPhraseQueryBuilder.java | 1 + .../index/search/MatchQuery.java | 31 ++- .../index/mapper/TextFieldMapperTests.java | 112 ++++++++- .../index/mapper/TextFieldTypeTests.java | 7 + 8 files changed, 457 insertions(+), 14 deletions(-) create mode 100644 rest-api-spec/src/main/resources/rest-api-spec/test/search/200_index_phrase_search.yml diff --git a/docs/reference/mapping/types/text.asciidoc b/docs/reference/mapping/types/text.asciidoc index 988a2ada38d7e..fd5bb312ef15c 100644 --- a/docs/reference/mapping/types/text.asciidoc +++ b/docs/reference/mapping/types/text.asciidoc @@ -96,6 +96,14 @@ The following parameters are accepted by `text` fields: the expense of a larger index. Accepts an <<index-prefix-config,`index-prefix configuration block`>> +<<index-phrases,`index_phrases`>>:: + + If enabled, two-term word combinations ('shingles') are indexed into a separate + field. This allows exact phrase queries to run more efficiently, at the expense + of a larger index. Note that this works best when stopwords are not removed, + as phrases containing stopwords will not use the subsidiary field and will fall + back to a standard phrase query. Accepts `true` or `false` (default). 
+ <<norms,`norms`>>:: Whether field-length should be taken into account when scoring queries. diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search/200_index_phrase_search.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search/200_index_phrase_search.yml new file mode 100644 index 0000000000000..241fbc187dec6 --- /dev/null +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search/200_index_phrase_search.yml @@ -0,0 +1,67 @@ +--- +"search with indexed phrases": + - skip: + version: " - 6.99.99" + reason: index_phrases is only available as of 7.0.0 + - do: + indices.create: + index: test + body: + mappings: + test: + properties: + text: + type: text + index_phrases: true + + - do: + index: + index: test + type: test + id: 1 + body: { text: "peter piper picked a peck of pickled peppers" } + + - do: + indices.refresh: + index: [test] + + - do: + search: + index: test + body: + query: + match_phrase: + text: + query: "peter piper" + + - match: {hits.total: 1} + + - do: + search: + index: test + q: '"peter piper"~1' + df: text + + - match: {hits.total: 1} + + - do: + search: + index: test + body: + query: + match_phrase: + text: "peter piper picked" + + - match: {hits.total: 1} + + - do: + search: + index: test + body: + query: + match_phrase: + text: "piper" + + - match: {hits.total: 1} + + diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java index facf669cb5d02..71450e69948fb 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java @@ -19,6 +19,7 @@ package org.elasticsearch.index.mapper; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; @@ -43,6 +44,7 @@ import 
org.elasticsearch.index.query.QueryRewriteContext; import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.index.query.QueryShardException; +import org.elasticsearch.index.search.MatchQuery; import org.elasticsearch.index.similarity.SimilarityProvider; import org.elasticsearch.search.DocValueFormat; import org.joda.time.DateTimeZone; @@ -353,6 +355,14 @@ public Query regexpQuery(String value, int flags, int maxDeterminizedStates, @Nu public abstract Query existsQuery(QueryShardContext context); + public Query phraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException { + throw new IllegalArgumentException("Can only use phrase queries on text fields - not on [" + name + "] which is of type [" + typeName() + "]"); + } + + public Query multiPhraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException { + throw new IllegalArgumentException("Can only use phrase queries on text fields - not on [" + name + "] which is of type [" + typeName() + "]"); + } + /** * An enum used to describe the relation between the range of terms in a * shard when compared with a query range diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index 9e2063adb144f..d2ba5fbc0c2d1 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -19,20 +19,29 @@ package org.elasticsearch.index.mapper; +import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.AnalyzerWrapper; +import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; +import 
org.apache.lucene.analysis.shingle.FixedShingleFilter; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.NormsFieldExistsQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.elasticsearch.common.collect.Iterators; +import org.elasticsearch.common.logging.ESLoggerFactory; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.support.XContentMapValues; @@ -43,7 +52,7 @@ import org.elasticsearch.index.query.QueryShardContext; import java.io.IOException; -import java.util.Collections; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -54,9 +63,13 @@ /** A {@link FieldMapper} for full-text fields. 
*/ public class TextFieldMapper extends FieldMapper { + private static final Logger logger = ESLoggerFactory.getLogger(TextFieldMapper.class); + public static final String CONTENT_TYPE = "text"; private static final int POSITION_INCREMENT_GAP_USE_ANALYZER = -1; + public static final String FAST_PHRASE_SUFFIX = "._index_phrase"; + public static class Defaults { public static final double FIELDDATA_MIN_FREQUENCY = 0; public static final double FIELDDATA_MAX_FREQUENCY = Integer.MAX_VALUE; @@ -105,6 +118,11 @@ public Builder fielddata(boolean fielddata) { return builder; } + public Builder indexPhrases(boolean indexPhrases) { + fieldType().setIndexPhrases(indexPhrases); + return builder; + } + @Override public Builder docValues(boolean docValues) { if (docValues) { @@ -166,8 +184,16 @@ public TextFieldMapper build(BuilderContext context) { prefixFieldType.setAnalyzer(fieldType.indexAnalyzer()); prefixMapper = new PrefixFieldMapper(prefixFieldType, context.indexSettings()); } + if (fieldType().indexPhrases) { + if (fieldType().isSearchable() == false) { + throw new IllegalArgumentException("Cannot set index_phrases on unindexed field [" + name() + "]"); + } + if (fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) { + throw new IllegalArgumentException("Cannot set index_phrases on field [" + name() + "] if positions are not enabled"); + } + } return new TextFieldMapper( - name, fieldType, defaultFieldType, positionIncrementGap, prefixMapper, + name, fieldType(), defaultFieldType, positionIncrementGap, prefixMapper, context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo); } } @@ -211,12 +237,35 @@ public Mapper.Builder parse(String fieldName, Map node, ParserCo builder.indexPrefixes(minChars, maxChars); DocumentMapperParser.checkNoRemainingFields(propName, indexPrefix, parserContext.indexVersionCreated()); iterator.remove(); + } else if (propName.equals("index_phrases")) { + 
builder.indexPhrases(XContentMapValues.nodeBooleanValue(propNode, "index_phrases")); + iterator.remove(); } } return builder; } } + private static class PhraseWrappedAnalyzer extends AnalyzerWrapper { + + private final Analyzer delegate; + + PhraseWrappedAnalyzer(Analyzer delegate) { + super(delegate.getReuseStrategy()); + this.delegate = delegate; + } + + @Override + protected Analyzer getWrappedAnalyzer(String fieldName) { + return delegate; + } + + @Override + protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { + return new TokenStreamComponents(components.getTokenizer(), new FixedShingleFilter(components.getTokenStream(), 2)); + } + } + private static class PrefixWrappedAnalyzer extends AnalyzerWrapper { private final int minChars; @@ -242,6 +291,46 @@ protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComp } } + private static final class PhraseFieldType extends StringFieldType { + + final TextFieldType parent; + + PhraseFieldType(TextFieldType parent) { + setTokenized(true); + setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + if (parent.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) { + setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + } + if (parent.storeTermVectorOffsets()) { + setStoreTermVectors(true); + setStoreTermVectorPositions(true); + setStoreTermVectorOffsets(true); + } + setAnalyzer(parent.indexAnalyzer().name(), parent.indexAnalyzer().analyzer()); + setName(parent.name() + FAST_PHRASE_SUFFIX); + this.parent = parent; + } + + void setAnalyzer(String name, Analyzer delegate) { + setIndexAnalyzer(new NamedAnalyzer(name, AnalyzerScope.INDEX, new PhraseWrappedAnalyzer(delegate))); + } + + @Override + public MappedFieldType clone() { + return new PhraseFieldType(parent); + } + + @Override + public String typeName() { + return "phrase"; + } + + @Override + public Query existsQuery(QueryShardContext context) { + throw 
new UnsupportedOperationException(); + } + } + static final class PrefixFieldType extends StringFieldType { final int minChars; @@ -310,6 +399,23 @@ public int hashCode() { } } + private static final class PhraseFieldMapper extends FieldMapper { + + PhraseFieldMapper(PhraseFieldType fieldType, Settings indexSettings) { + super(fieldType.name(), fieldType, fieldType, indexSettings, MultiFields.empty(), CopyTo.empty()); + } + + @Override + protected void parseCreateField(ParseContext context, List fields) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + protected String contentType() { + return "phrase"; + } + } + private static final class PrefixFieldMapper extends FieldMapper { protected PrefixFieldMapper(PrefixFieldType fieldType, Settings indexSettings) { @@ -343,6 +449,7 @@ public static final class TextFieldType extends StringFieldType { private double fielddataMaxFrequency; private int fielddataMinSegmentSize; private PrefixFieldType prefixFieldType; + private boolean indexPhrases = false; public TextFieldType() { setTokenized(true); @@ -358,6 +465,7 @@ protected TextFieldType(TextFieldType ref) { this.fielddataMinFrequency = ref.fielddataMinFrequency; this.fielddataMaxFrequency = ref.fielddataMaxFrequency; this.fielddataMinSegmentSize = ref.fielddataMinSegmentSize; + this.indexPhrases = ref.indexPhrases; if (ref.prefixFieldType != null) { this.prefixFieldType = ref.prefixFieldType.clone(); } @@ -374,6 +482,7 @@ public boolean equals(Object o) { } TextFieldType that = (TextFieldType) o; return fielddata == that.fielddata + && indexPhrases == that.indexPhrases && Objects.equals(prefixFieldType, that.prefixFieldType) && fielddataMinFrequency == that.fielddataMinFrequency && fielddataMaxFrequency == that.fielddataMaxFrequency @@ -382,7 +491,7 @@ public boolean equals(Object o) { @Override public int hashCode() { - return Objects.hash(super.hashCode(), fielddata, prefixFieldType, + return Objects.hash(super.hashCode(), 
fielddata, indexPhrases, prefixFieldType, fielddataMinFrequency, fielddataMaxFrequency, fielddataMinSegmentSize); } @@ -427,6 +536,11 @@ void setPrefixFieldType(PrefixFieldType prefixFieldType) { this.prefixFieldType = prefixFieldType; } + void setIndexPhrases(boolean indexPhrases) { + checkIfFrozen(); + this.indexPhrases = indexPhrases; + } + public PrefixFieldType getPrefixFieldType() { return this.prefixFieldType; } @@ -458,6 +572,93 @@ public Query existsQuery(QueryShardContext context) { } } + @Override + public Query phraseQuery(String field, TokenStream stream, int slop, boolean enablePosIncrements) throws IOException { + + if (indexPhrases && slop == 0 && hasGaps(cache(stream)) == false) { + stream = new FixedShingleFilter(stream, 2); + field = field + FAST_PHRASE_SUFFIX; + } + PhraseQuery.Builder builder = new PhraseQuery.Builder(); + builder.setSlop(slop); + + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); + int position = -1; + + stream.reset(); + while (stream.incrementToken()) { + if (enablePosIncrements) { + position += posIncrAtt.getPositionIncrement(); + } + else { + position += 1; + } + builder.add(new Term(field, termAtt.getBytesRef()), position); + } + + return builder.build(); + } + + @Override + public Query multiPhraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException { + + if (indexPhrases && slop == 0 && hasGaps(cache(stream)) == false) { + stream = new FixedShingleFilter(stream, 2); + field = field + FAST_PHRASE_SUFFIX; + } + + MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder(); + mpqb.setSlop(slop); + + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + + PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); + int position = -1; + + List multiTerms = new 
ArrayList<>(); + stream.reset(); + while (stream.incrementToken()) { + int positionIncrement = posIncrAtt.getPositionIncrement(); + + if (positionIncrement > 0 && multiTerms.size() > 0) { + if (enablePositionIncrements) { + mpqb.add(multiTerms.toArray(new Term[0]), position); + } else { + mpqb.add(multiTerms.toArray(new Term[0])); + } + multiTerms.clear(); + } + position += positionIncrement; + multiTerms.add(new Term(field, termAtt.getBytesRef())); + } + + if (enablePositionIncrements) { + mpqb.add(multiTerms.toArray(new Term[0]), position); + } else { + mpqb.add(multiTerms.toArray(new Term[0])); + } + return mpqb.build(); + } + + private static CachingTokenFilter cache(TokenStream in) { + if (in instanceof CachingTokenFilter) { + return (CachingTokenFilter) in; + } + return new CachingTokenFilter(in); + } + + private static boolean hasGaps(CachingTokenFilter stream) throws IOException { + PositionIncrementAttribute posIncAtt = stream.getAttribute(PositionIncrementAttribute.class); + stream.reset(); + while (stream.incrementToken()) { + if (posIncAtt.getPositionIncrement() > 1) { + return true; + } + } + return false; + } + @Override public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) { if (fielddata == false) { @@ -472,6 +673,9 @@ public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) { public void checkCompatibility(MappedFieldType other, List conflicts) { super.checkCompatibility(other, conflicts); TextFieldType tft = (TextFieldType) other; + if (tft.indexPhrases != this.indexPhrases) { + conflicts.add("mapper [" + name() + "] has different [index_phrases] values"); + } if (Objects.equals(this.prefixFieldType, tft.prefixFieldType) == false) { if (this.prefixFieldType == null) { conflicts.add("mapper [" + name() @@ -490,8 +694,9 @@ else if (tft.prefixFieldType == null) { private int positionIncrementGap; private PrefixFieldMapper prefixFieldMapper; + private PhraseFieldMapper phraseFieldMapper; - protected 
TextFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, + protected TextFieldMapper(String simpleName, TextFieldType fieldType, MappedFieldType defaultFieldType, int positionIncrementGap, PrefixFieldMapper prefixFieldMapper, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); @@ -502,6 +707,7 @@ protected TextFieldMapper(String simpleName, MappedFieldType fieldType, MappedFi } this.positionIncrementGap = positionIncrementGap; this.prefixFieldMapper = prefixFieldMapper; + this.phraseFieldMapper = fieldType.indexPhrases ? new PhraseFieldMapper(new PhraseFieldType(fieldType), indexSettings) : null; } @Override @@ -535,15 +741,25 @@ protected void parseCreateField(ParseContext context, List field if (prefixFieldMapper != null) { prefixFieldMapper.addField(value, fields); } + if (phraseFieldMapper != null) { + fields.add(new Field(phraseFieldMapper.fieldType.name(), value, phraseFieldMapper.fieldType)); + } } } @Override public Iterator iterator() { - if (prefixFieldMapper == null) { + List subIterators = new ArrayList<>(); + if (prefixFieldMapper != null) { + subIterators.add(prefixFieldMapper); + } + if (phraseFieldMapper != null) { + subIterators.add(phraseFieldMapper); + } + if (subIterators.size() == 0) { return super.iterator(); } - return Iterators.concat(super.iterator(), Collections.singleton(prefixFieldMapper).iterator()); + return Iterators.concat(super.iterator(), subIterators.iterator()); } @Override @@ -562,6 +778,10 @@ else if (this.prefixFieldMapper != null || mw.prefixFieldMapper != null) { throw new IllegalArgumentException("mapper [" + name() + "] has different index_prefix settings, current [" + this.prefixFieldMapper + "], merged [" + mw.prefixFieldMapper + "]"); } + else if (this.fieldType().indexPhrases != mw.fieldType().indexPhrases) { + throw new IllegalArgumentException("mapper [" + name() + "] has different 
index_phrases settings, current [" + + this.fieldType().indexPhrases + "], merged [" + mw.fieldType().indexPhrases + "]"); + } } @Override @@ -602,5 +822,8 @@ protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, if (fieldType().prefixFieldType != null) { fieldType().prefixFieldType.doXContent(builder); } + if (fieldType().indexPhrases) { + builder.field("index_phrases", fieldType().indexPhrases); + } } } diff --git a/server/src/main/java/org/elasticsearch/index/query/MatchPhraseQueryBuilder.java b/server/src/main/java/org/elasticsearch/index/query/MatchPhraseQueryBuilder.java index 7dc01bb34503d..4639b8df8e539 100644 --- a/server/src/main/java/org/elasticsearch/index/query/MatchPhraseQueryBuilder.java +++ b/server/src/main/java/org/elasticsearch/index/query/MatchPhraseQueryBuilder.java @@ -28,6 +28,7 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.search.MatchQuery; import org.elasticsearch.index.search.MatchQuery.ZeroTermsQuery; diff --git a/server/src/main/java/org/elasticsearch/index/search/MatchQuery.java b/server/src/main/java/org/elasticsearch/index/search/MatchQuery.java index 7765be215aa7f..43c842a1697e6 100644 --- a/server/src/main/java/org/elasticsearch/index/search/MatchQuery.java +++ b/server/src/main/java/org/elasticsearch/index/search/MatchQuery.java @@ -352,16 +352,14 @@ protected Query newSynonymQuery(Term[] terms) { @Override protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException { - if (hasPositions(mapper) == false) { - IllegalStateException exc = - new IllegalStateException("field:[" + field + "] was indexed without position data; cannot run PhraseQuery"); + IllegalStateException e = checkForPositions(field); + if (e != null) { if (lenient) { - return 
newLenientFieldQuery(field, exc); - } else { - throw exc; + return newLenientFieldQuery(field, e); } + throw e; } - Query query = super.analyzePhrase(field, stream, slop); + Query query = mapper.phraseQuery(field, stream, slop, enablePositionIncrements); if (query instanceof PhraseQuery) { // synonyms that expand to multiple terms can return a phrase query. return blendPhraseQuery((PhraseQuery) query, mapper); @@ -369,6 +367,25 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws return query; } + @Override + protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) throws IOException { + IllegalStateException e = checkForPositions(field); + if (e != null) { + if (lenient) { + return newLenientFieldQuery(field, e); + } + throw e; + } + return mapper.multiPhraseQuery(field, stream, slop, enablePositionIncrements); + } + + private IllegalStateException checkForPositions(String field) { + if (hasPositions(mapper) == false) { + return new IllegalStateException("field:[" + field + "] was indexed without position data; cannot run PhraseQuery"); + } + return null; + } + /** * Checks if graph analysis should be enabled for the field depending * on the provided {@link Analyzer} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java index 772762997fad6..b7da270a15ab3 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldMapperTests.java @@ -19,6 +19,8 @@ package org.elasticsearch.index.mapper; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexOptions; @@ -29,6 +31,8 @@ import org.apache.lucene.index.Term; import 
org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; @@ -38,6 +42,7 @@ import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.common.lucene.uid.Versions; +import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.ToXContent; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; @@ -47,7 +52,9 @@ import org.elasticsearch.index.engine.Engine; import org.elasticsearch.index.mapper.MapperService.MergeReason; import org.elasticsearch.index.mapper.TextFieldMapper.TextFieldType; +import org.elasticsearch.index.query.MatchPhraseQueryBuilder; import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.index.search.MatchQuery; import org.elasticsearch.index.shard.IndexShard; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESSingleNodeTestCase; @@ -65,6 +72,7 @@ import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; +import static org.hamcrest.core.Is.is; public class TextFieldMapperTests extends ESSingleNodeTestCase { @@ -73,7 +81,13 @@ public class TextFieldMapperTests extends ESSingleNodeTestCase { @Before public void setup() { - indexService = createIndex("test"); + Settings settings = Settings.builder() + .put("index.analysis.filter.mySynonyms.type", "synonym") + .putList("index.analysis.filter.mySynonyms.synonyms", Collections.singletonList("car, auto")) + .put("index.analysis.analyzer.synonym.tokenizer", "standard") + .put("index.analysis.analyzer.synonym.filter", "mySynonyms") + .build(); + indexService = 
createIndex("test", settings); parser = indexService.mapperService().documentMapperParser(); } @@ -670,6 +684,102 @@ public void testIndexPrefixIndexTypes() throws IOException { } } + public void testFastPhraseMapping() throws IOException { + + QueryShardContext queryShardContext = indexService.newQueryShardContext( + randomInt(20), null, () -> { + throw new UnsupportedOperationException(); + }, null); + + String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties") + .startObject("field") + .field("type", "text") + .field("analyzer", "english") + .field("index_phrases", true) + .endObject() + .startObject("synfield") + .field("type", "text") + .field("analyzer", "synonym") + .field("index_phrases", true) + .endObject() + .endObject() + .endObject().endObject()); + + DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping)); + assertEquals(mapping, mapper.mappingSource().toString()); + + queryShardContext.getMapperService().merge("type", new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE); + + Query q = new MatchPhraseQueryBuilder("field", "two words").toQuery(queryShardContext); + assertThat(q, is(new PhraseQuery("field._index_phrase", "two word"))); + + Query q2 = new MatchPhraseQueryBuilder("field", "three words here").toQuery(queryShardContext); + assertThat(q2, is(new PhraseQuery("field._index_phrase", "three word", "word here"))); + + Query q3 = new MatchPhraseQueryBuilder("field", "two words").slop(1).toQuery(queryShardContext); + assertThat(q3, is(new PhraseQuery(1, "field", "two", "word"))); + + Query q4 = new MatchPhraseQueryBuilder("field", "singleton").toQuery(queryShardContext); + assertThat(q4, is(new TermQuery(new Term("field", "singleton")))); + + Query q5 = new MatchPhraseQueryBuilder("field", "sparkle a stopword").toQuery(queryShardContext); + assertThat(q5, + is(new PhraseQuery.Builder().add(new Term("field", "sparkl")).add(new Term("field", 
"stopword"), 2).build())); + + Query q6 = new MatchPhraseQueryBuilder("synfield", "motor car").toQuery(queryShardContext); + assertThat(q6, is(new MultiPhraseQuery.Builder() + .add(new Term[]{ + new Term("synfield._index_phrase", "motor car"), + new Term("synfield._index_phrase", "motor auto")}) + .build())); + + ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", BytesReference + .bytes(XContentFactory.jsonBuilder() + .startObject() + .field("field", "Some English text that is going to be very useful") + .endObject()), + XContentType.JSON)); + + IndexableField[] fields = doc.rootDoc().getFields("field._index_phrase"); + assertEquals(1, fields.length); + + try (TokenStream ts = fields[0].tokenStream(queryShardContext.getMapperService().indexAnalyzer(), null)) { + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + ts.reset(); + assertTrue(ts.incrementToken()); + assertEquals("some english", termAtt.toString()); + } + + { + String badConfigMapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field") + .field("type", "text") + .field("index", "false") + .field("index_phrases", true) + .endObject().endObject() + .endObject().endObject()); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> parser.parse("type", new CompressedXContent(badConfigMapping)) + ); + assertThat(e.getMessage(), containsString("Cannot set index_phrases on unindexed field [field]")); + } + + { + String badConfigMapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field") + .field("type", "text") + .field("index_options", "freqs") + .field("index_phrases", true) + .endObject().endObject() + .endObject().endObject()); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> parser.parse("type", new CompressedXContent(badConfigMapping)) + ); 
+ assertThat(e.getMessage(), containsString("Cannot set index_phrases on field [field] if positions are not enabled")); + } + } + public void testIndexPrefixMapping() throws IOException { QueryShardContext queryShardContext = indexService.newQueryShardContext( diff --git a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java index d0eacfad44056..877553bacf919 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java @@ -68,6 +68,13 @@ public void modify(MappedFieldType ft) { tft.setFielddataMinSegmentSize(1000); } }); + addModifier(new Modifier("index_phrases", false) { + @Override + public void modify(MappedFieldType ft) { + TextFieldMapper.TextFieldType tft = (TextFieldMapper.TextFieldType) ft; + tft.setIndexPhrases(true); + } + }); addModifier(new Modifier("index_prefixes", false) { @Override public void modify(MappedFieldType ft) {