Improve lookup for "include_unmapped" field pattern

Currently we use a CharacterRunAutomaton built from all field patterns that have the "include_unmapped" option set. This can lead to unnecessarily large automata in cases where the user needlessly lists all known fields and adds the "include_unmapped" option. We only really need the automaton for patterns that contain wildcards; any other field path can be looked up directly, and only needs to be looked up if the field path isn't actually mapped. Relates to elastic#69983
cbuescher · Mar 9, 2021 · 11a29c4 · 11a29c4
1 parent 80d486f
commit 11a29c4
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 4 deletions.
diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/FieldFetcher.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/FieldFetcher.java
@@ -13,6 +13,7 @@
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.document.DocumentField;
 import org.elasticsearch.common.regex.Regex;
+import org.elasticsearch.common.xcontent.support.XContentMapValues;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.NestedValueFetcher;
 import org.elasticsearch.index.mapper.ObjectMapper;
@@ -38,6 +39,11 @@
  */
 public class FieldFetcher {
 
+    /**
+     * Default maximum number of states in the automaton that looks up unmapped fields.
+     */
+    private static final int AUTOMATON_MAX_DETERMINIZED_STATES = 100000;
+
     public static FieldFetcher create(SearchExecutionContext context,
         Collection<FieldAndFormat> fieldAndFormats) {
         Set<String> nestedMappingPaths = context.hasNested()
@@ -115,23 +121,31 @@ private static FieldFetcher create(SearchExecutionContext context,
         }
 
         CharacterRunAutomaton unmappedFieldsFetchAutomaton = null;
-        if (unmappedFetchPattern.isEmpty() == false) {
+        Map<Boolean, List<String>> partitions = unmappedFetchPattern.stream()
+            .collect(Collectors.partitioningBy((s -> Regex.isSimpleMatchPattern(s))));
+        List<String> unmappedWildcardPattern = partitions.get(true);
+        List<String> unmappedConcreteFields = partitions.get(false);
+        if (unmappedWildcardPattern.isEmpty() == false) {
             unmappedFieldsFetchAutomaton = new CharacterRunAutomaton(
-                Regex.simpleMatchToAutomaton(unmappedFetchPattern.toArray(new String[unmappedFetchPattern.size()]))
+                Regex.simpleMatchToAutomaton(unmappedWildcardPattern.toArray(new String[unmappedWildcardPattern.size()])),
+                AUTOMATON_MAX_DETERMINIZED_STATES
             );
         }
-        return new FieldFetcher(fieldContexts, unmappedFieldsFetchAutomaton);
+        return new FieldFetcher(fieldContexts, unmappedFieldsFetchAutomaton, unmappedConcreteFields);
     }
 
     private final Map<String, FieldContext> fieldContexts;
     private final CharacterRunAutomaton unmappedFieldsFetchAutomaton;
+    private final List<String> unmappedConcreteFields;
 
     private FieldFetcher(
         Map<String, FieldContext> fieldContexts,
-        @Nullable CharacterRunAutomaton unmappedFieldsFetchAutomaton
+        @Nullable CharacterRunAutomaton unmappedFieldsFetchAutomaton,
+        @Nullable List<String> unmappedConcreteFields
     ) {
         this.fieldContexts = fieldContexts;
         this.unmappedFieldsFetchAutomaton = unmappedFieldsFetchAutomaton;
+        this.unmappedConcreteFields = unmappedConcreteFields;
     }
 
     public Map<String, DocumentField> fetch(SourceLookup sourceLookup) throws IOException {
@@ -148,6 +162,17 @@ public Map<String, DocumentField> fetch(SourceLookup sourceLookup) throws IOExce
         if (this.unmappedFieldsFetchAutomaton != null) {
             collectUnmapped(documentFields, sourceLookup.source(), "", 0);
         }
+        if (this.unmappedConcreteFields != null) {
+            for (String path : unmappedConcreteFields) {
+                if (this.fieldContexts.containsKey(path)) {
+                    continue; // this is actually a mapped field
+                }
+                List<Object> values = XContentMapValues.extractRawValues(path, sourceLookup.source());
+                if (values.isEmpty() == false) {
+                    documentFields.put(path, new DocumentField(path, values));
+                }
+            }
+        }
         return documentFields;
     }
 

diff --git a/server/src/test/java/org/elasticsearch/search/fetch/subphase/FieldFetcherTests.java b/server/src/test/java/org/elasticsearch/search/fetch/subphase/FieldFetcherTests.java
@@ -8,6 +8,7 @@
 
 package org.elasticsearch.search.fetch.subphase;
 
+import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.common.Strings;
@@ -846,6 +847,23 @@ public void testLastFormatWins() throws IOException {
         assertThat(fields.get("date_field").getValues().get(1), equalTo("12"));
     }
 
+    /**
+     * Field patterns retrieved with "include_unmapped" use an automaton with a maximum allowed size internally.
+     * This test checks that we have a bound in place to avoid misuse of this with exceptionally large field patterns.
+     */
+    public void testTooManyUnmappedFieldWildcardPattern() throws IOException {
+        MapperService mapperService = createMapperService();
+
+        XContentBuilder source = XContentFactory.jsonBuilder().startObject().field("a", "foo").endObject();
+
+        List<FieldAndFormat> fieldAndFormatList = new ArrayList<>();
+        boolean includeUnmapped = true;
+        for (int i = 0; i < 1000; i++) {
+            fieldAndFormatList.add(new FieldAndFormat(randomAlphaOfLength(150) + "*", null, includeUnmapped));
+        }
+        expectThrows(TooComplexToDeterminizeException.class, () -> fetchFields(mapperService, source, fieldAndFormatList));
+    }
+
     private List<FieldAndFormat> fieldAndFormatList(String name, String format, boolean includeUnmapped) {
         return Collections.singletonList(new FieldAndFormat(name, format, includeUnmapped));
     }