Index phrases (#30450)

Specifying `index_phrases: true` on a text field mapping will add a subsidiary [field]._index_phrase field, indexing two-term shingles from the parent field. The parent analysis chain is re-used, wrapped with a FixedShingleFilter. At query time, if a phrase match query is executed, the mapping will redirect it to run against the subsidiary field. This should give faster phrase querying, at the cost of a larger index and longer indexing times. Relates to #27049
elastic · Jun 4, 2018 · 0427339 · 0427339
1 parent dc8a4fb
commit 0427339
Show file tree

Hide file tree

Showing 8 changed files with 457 additions and 14 deletions.
diff --git a/docs/reference/mapping/types/text.asciidoc b/docs/reference/mapping/types/text.asciidoc
@@ -96,6 +96,14 @@ The following parameters are accepted by `text` fields:
     the expense of a larger index. Accepts an
     <<index-prefix-config,`index-prefix configuration block`>>
 
+<<index-phrases,`index_phrases`>>::
+
+    If enabled, two-term word combinations ('shingles') are indexed into a separate
+    field.  This allows exact phrase queries to run more efficiently, at the expense
+    of a larger index.  Note that this works best when stopwords are not removed:
+    if they are removed, phrases containing stopwords will not use the subsidiary field
+    and will fall back to a standard phrase query.  Accepts `true` or `false` (default).
+
 <<norms,`norms`>>::
 
     Whether field-length should be taken into account when scoring queries.

diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search/200_index_phrase_search.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search/200_index_phrase_search.yml
@@ -0,0 +1,67 @@
+---
+"search with indexed phrases":
+  - skip:
+      version: " - 6.99.99"
+      reason: index_phrases is only available as of 7.0.0
+  - do:
+      indices.create:
+        index:  test
+        body:
+          mappings:
+            test:
+              properties:
+                text:
+                  type: text
+                  index_phrases: true
+
+  - do:
+      index:
+          index:  test
+          type:   test
+          id:     1
+          body:   { text: "peter piper picked a peck of pickled peppers" }
+
+  - do:
+      indices.refresh:
+        index: [test]
+
+  - do:
+      search:
+        index: test
+        body:
+          query:
+            match_phrase:
+              text:
+                query: "peter piper"
+
+  - match: {hits.total: 1}
+
+  - do:
+      search:
+        index: test
+        q: '"peter piper"~1'
+        df: text
+
+  - match: {hits.total: 1}
+
+  - do:
+      search:
+        index: test
+        body:
+          query:
+            match_phrase:
+              text: "peter piper picked"
+
+  - match: {hits.total: 1}
+
+  - do:
+      search:
+        index: test
+        body:
+          query:
+            match_phrase:
+              text: "piper"
+
+  - match: {hits.total: 1}
+
+
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java
@@ -19,6 +19,7 @@
 
 package org.elasticsearch.index.mapper;
 
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
@@ -43,6 +44,7 @@
 import org.elasticsearch.index.query.QueryRewriteContext;
 import org.elasticsearch.index.query.QueryShardContext;
 import org.elasticsearch.index.query.QueryShardException;
+import org.elasticsearch.index.search.MatchQuery;
 import org.elasticsearch.index.similarity.SimilarityProvider;
 import org.elasticsearch.search.DocValueFormat;
 import org.joda.time.DateTimeZone;
@@ -353,6 +355,14 @@ public Query regexpQuery(String value, int flags, int maxDeterminizedStates, @Nu
 
     public abstract Query existsQuery(QueryShardContext context);
 
+    /**
+     * Phrase-query hook for this field type.
+     * <p>
+     * This implementation never builds a query: it unconditionally rejects the
+     * request, naming this field and its type in the error message. None of the
+     * arguments are consulted here.
+     * NOTE(review): presumably text field types override this - confirm against subclasses.
+     *
+     * @param field                    not used by this implementation
+     * @param stream                   not used by this implementation
+     * @param slop                     not used by this implementation
+     * @param enablePositionIncrements not used by this implementation
+     * @return never returns normally in this implementation
+     * @throws IllegalArgumentException always
+     * @throws IOException              declared in the signature; not thrown by this implementation
+     */
+    public Query phraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException {
+        final String message = "Can only use phrase queries on text fields - not on [" + name
+            + "] which is of type [" + typeName() + "]";
+        throw new IllegalArgumentException(message);
+    }
+
+    /**
+     * Multi-phrase-query hook for this field type.
+     * <p>
+     * This implementation never builds a query: it unconditionally rejects the
+     * request with the same message used for plain phrase queries, naming this
+     * field and its type. None of the arguments are consulted here.
+     * NOTE(review): presumably text field types override this - confirm against subclasses.
+     *
+     * @param field                    not used by this implementation
+     * @param stream                   not used by this implementation
+     * @param slop                     not used by this implementation
+     * @param enablePositionIncrements not used by this implementation
+     * @return never returns normally in this implementation
+     * @throws IllegalArgumentException always
+     * @throws IOException              declared in the signature; not thrown by this implementation
+     */
+    public Query multiPhraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException {
+        final String message = "Can only use phrase queries on text fields - not on [" + name
+            + "] which is of type [" + typeName() + "]";
+        throw new IllegalArgumentException(message);
+    }
+
     /**
      * An enum used to describe the relation between the range of terms in a
      * shard when compared with a query range