Add keyword fields above ignore_above to _ignored (#74418)
Currently the `_ignored` field indexes and stores the names of every field in a document that was ignored, e.g. because it was malformed. The `ignore_above` option for keyword-type fields serves a somewhat similar purpose, so this change adds logic that records these fields in `_ignored` as well for `keyword`, `wildcard`, and `icu_collation_keyword` fields.

Closes #74228
Christoph Büscher authored Jun 22, 2021
1 parent ad8d96e commit 437805b
Showing 6 changed files with 62 additions and 18 deletions.
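
For context, the following is a minimal, self-contained sketch of the pattern this change applies. All names here (IgnoreAboveSketch, indexValue, ignored) are hypothetical stand-ins for illustration; the real logic lives in the mapper classes changed below.

import java.util.HashSet;
import java.util.Set;

// Hypothetical stand-in for the mapper logic in this commit: values longer than
// ignore_above are not indexed, and the field's name is recorded so that it ends
// up in the _ignored metadata field of the document.
class IgnoreAboveSketch {
    private final int ignoreAbove;
    private final Set<String> ignoredFields = new HashSet<>();

    IgnoreAboveSketch(int ignoreAbove) {
        this.ignoreAbove = ignoreAbove;
    }

    void indexValue(String fieldName, String value) {
        if (value == null) {
            return; // null values are skipped silently, as before this change
        }
        if (value.length() > ignoreAbove) {
            ignoredFields.add(fieldName); // new behavior: remember the skipped field
            return;
        }
        // ... create and add the Lucene fields as usual ...
    }

    Set<String> ignored() {
        return ignoredFields;
    }

    public static void main(String[] args) {
        IgnoreAboveSketch mapper = new IgnoreAboveSketch(5);
        mapper.indexValue("field", "elk");           // 3 chars <= 5: indexed normally
        mapper.indexValue("field", "elasticsearch"); // 13 chars > 5: skipped and recorded
        System.out.println(mapper.ignored());        // prints [field]
    }
}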
@@ -20,11 +20,11 @@
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.index.analysis.IndexableBinaryStringTools;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.plain.SortedSetOrdinalsIndexFieldData;
@@ -444,7 +444,12 @@ protected void parseCreateField(ParseContext context) throws IOException {
value = parser.textOrNull();
}

if (value == null || value.length() > ignoreAbove) {
if (value == null) {
return;
}

if (value.length() > ignoreAbove) {
context.addIgnoredField(name());
return;
}

@@ -267,10 +267,15 @@ public void testIgnoreAbove() throws IOException {
ParsedDocument doc = mapper.parse(source(b -> b.field("field", "elk")));
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
fields = doc.rootDoc().getFields("_ignored");
assertEquals(0, fields.length);

doc = mapper.parse(source(b -> b.field("field", "elasticsearch")));
fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);
fields = doc.rootDoc().getFields("_ignored");
assertEquals(1, fields.length);
assertEquals("field", fields[0].stringValue());
}

public void testUpdateIgnoreAbove() throws IOException {
@@ -277,9 +277,9 @@ public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutio
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);

CompiledAutomaton automaton = new CompiledAutomaton(a);

BytesRef searchBytes = searchAfter == null? null: new BytesRef(searchAfter);

if (automaton.type == AUTOMATON_TYPE.ALL) {
TermsEnum result = terms.iterator();
if (searchAfter != null) {
@@ -289,12 +289,12 @@
}
return terms.intersect(automaton, searchBytes);
}

// Initialises with a seek to a given term but excludes that term
// from any results. The problem it addresses is that termsEnum.seekCeil()
// would work but either leaves us positioned on the seek term (if it exists) or the
// term after (if the seek term doesn't exist). That complicates any subsequent
// iteration logic so this class simplifies the pagination use case.
final class SearchAfterTermsEnum extends FilteredTermsEnum {
private final BytesRef afterRef;

@@ -308,7 +308,7 @@ final class SearchAfterTermsEnum extends FilteredTermsEnum {
protected AcceptStatus accept(BytesRef term) {
return term.equals(afterRef) ? AcceptStatus.NO : AcceptStatus.YES;
}
}

@Override
public String typeName() {
@@ -475,7 +475,12 @@ protected void indexScriptValues(SearchLookup searchLookup, LeafReaderContext re

private void indexValue(ParseContext context, String value) {

if (value == null || value.length() > ignoreAbove) {
if (value == null) {
return;
}

if (value.length() > ignoreAbove) {
context.addIgnoredField(name());
return;
}

@@ -532,6 +537,6 @@ protected String contentType() {
public FieldMapper.Builder getMergeBuilder() {
return new Builder(simpleName(), indexAnalyzers, scriptCompiler).init(this);
}


}
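
Aside: the SearchAfterTermsEnum comment above describes a general Lucene pattern: seek to a term, then exclude that term from the results so iteration resumes strictly after it. Here is a standalone sketch of the same pattern against the plain Lucene API; it is an illustration of that idea, not code from this commit.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

public class SearchAfterTermsDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = new ByteBuffersDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
            for (String value : new String[] { "apple", "banana", "cherry" }) {
                Document doc = new Document();
                doc.add(new StringField("field", value, Field.Store.NO));
                writer.addDocument(doc);
            }
        }
        try (IndexReader reader = DirectoryReader.open(dir)) {
            TermsEnum terms = MultiTerms.getTerms(reader, "field").iterator();
            BytesRef after = new BytesRef("banana");
            // Seek to "banana" but reject it, so iteration starts strictly after it.
            TermsEnum paged = new FilteredTermsEnum(terms) {
                { setInitialSeekTerm(after); }
                @Override
                protected AcceptStatus accept(BytesRef term) {
                    return term.equals(after) ? AcceptStatus.NO : AcceptStatus.YES;
                }
            };
            for (BytesRef term = paged.next(); term != null; term = paged.next()) {
                System.out.println(term.utf8ToString()); // prints only "cherry"
            }
        }
        dir.close();
    }
}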
@@ -214,10 +214,16 @@ public void testIgnoreAbove() throws IOException {
ParsedDocument doc = mapper.parse(source(b -> b.field("field", "elk")));
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
fields = doc.rootDoc().getFields("_ignored");
assertEquals(0, fields.length);

doc = mapper.parse(source(b -> b.field("field", "elasticsearch")));
fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);

fields = doc.rootDoc().getFields("_ignored");
assertEquals(1, fields.length);
assertEquals("field", fields[0].stringValue());
}

public void testNullValue() throws IOException {
@@ -964,14 +964,17 @@ protected void parseCreateField(ParseContext context) throws IOException {
ParseContext.Document parseDoc = context.doc();

List<IndexableField> fields = new ArrayList<>();
createFields(value, parseDoc, fields);
if (value != null) {
if (value.length() <= ignoreAbove) {
createFields(value, parseDoc, fields);
} else {
context.addIgnoredField(name());
}
}
parseDoc.addAll(fields);
}

void createFields(String value, Document parseDoc, List<IndexableField> fields) {
if (value == null || value.length() > ignoreAbove) {
return;
}
String ngramValue = addLineEndChars(value);
Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType);
fields.add(ngramField);
@@ -53,10 +53,12 @@
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.mapper.ContentPath;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperTestCase;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.search.lookup.SearchLookup;
@@ -106,7 +108,7 @@ protected Collection<? extends Plugin> getPlugins() {
protected boolean supportsStoredFields() {
return false;
}

@Override
@Before
public void setUp() throws Exception {
@@ -142,12 +144,30 @@ public void testTooBigKeywordField() throws IOException {

Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery("*a*", null, null);
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));
assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(1L));

reader.close();
dir.close();
}

public void testIgnoreAbove() throws IOException {
DocumentMapper mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard").field("ignore_above", 5)));

ParsedDocument doc = mapper.parse(source(b -> b.field("field", "elk")));
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
fields = doc.rootDoc().getFields("_ignored");
assertEquals(0, fields.length);

doc = mapper.parse(source(b -> b.field("field", "elasticsearch")));
fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);

fields = doc.rootDoc().getFields("_ignored");
assertEquals(1, fields.length);
assertEquals("field", fields[0].stringValue());
}

public void testBWCIndexVersion() throws IOException {
// Create old format index using wildcard ngram analyzer used in 7.9 launch
Directory dir = newDirectory();
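
With this change, documents whose keyword, wildcard, or icu_collation_keyword values exceed ignore_above become discoverable through the _ignored metadata field, just like documents that were ignored as malformed. As a closing illustration, a sketch of how such documents could be looked up with the 7.x Java query builders; this assumes the elasticsearch artifact on the classpath and is not part of the commit.

import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;

// Builds the body of a search request matching every document in which at least
// one field was ignored at index time; after this commit that includes keyword,
// wildcard and icu_collation_keyword values longer than ignore_above.
public class FindIgnoredDocs {
    public static void main(String[] args) {
        SearchSourceBuilder source = new SearchSourceBuilder()
            .query(QueryBuilders.existsQuery("_ignored"));
        System.out.println(source); // {"query":{"exists":{"field":"_ignored"}}}
    }
}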
