Skip to content

Commit

Permalink
Combine regexp and set include/exclude filters to allow mix & matching
Browse files (browse the repository at this point in the history)
  • Loading branch information
hchargois committed Sep 16, 2020
1 parent 1c2362f commit 3d12563
Showing 1 changed file with 73 additions and 79 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -78,17 +78,8 @@ public static IncludeExclude merge(IncludeExclude include, IncludeExclude exclud
if (include.isPartitionBased()) {
throw new IllegalArgumentException("Cannot specify any excludes when using a partition-based include");
}
String includeMethod = include.isRegexBased() ? "regex" : "set";
String excludeMethod = exclude.isRegexBased() ? "regex" : "set";
if (includeMethod.equals(excludeMethod) == false) {
throw new IllegalArgumentException("Cannot mix a " + includeMethod + "-based include with a "
+ excludeMethod + "-based method");
}
if (include.isRegexBased()) {
return new IncludeExclude(include.include, exclude.exclude);
} else {
return new IncludeExclude(include.includeValues, exclude.excludeValues);
}

return new IncludeExclude(include.include, exclude.exclude, include.includeValues, exclude.excludeValues);
}

public static IncludeExclude parseInclude(XContentParser parser) throws IOException {
Expand Down Expand Up @@ -196,46 +187,41 @@ public boolean accept(BytesRef value) {
}
}

static class AutomatonBackedStringFilter extends StringFilter {
static class CombinedStringFilter extends StringFilter {

private final ByteRunAutomaton runAutomaton;

private AutomatonBackedStringFilter(Automaton automaton) {
this.runAutomaton = new ByteRunAutomaton(automaton);
}

/**
* Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
*/
@Override
public boolean accept(BytesRef value) {
return runAutomaton.run(value.bytes, value.offset, value.length);
}
}

static class TermListBackedStringFilter extends StringFilter {

private final Set<BytesRef> valids;
private final Set<BytesRef> invalids;

TermListBackedStringFilter(Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
private CombinedStringFilter(Automaton automaton, Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
if (automaton != null) {
this.runAutomaton = new ByteRunAutomaton(automaton);
} else {
this.runAutomaton = null;
}
this.valids = includeValues;
this.invalids = excludeValues;
}

/**
* Returns whether the given value is accepted based on the
* {@code include} &amp; {@code exclude} sets.
* Returns whether the given value is accepted based on the {@code include} &amp; {@code exclude} patterns.
*/
@Override
public boolean accept(BytesRef value) {
return ((valids == null) || (valids.contains(value))) && ((invalids == null) || (!invalids.contains(value)));
if (valids != null && !valids.contains(value)) {
return false;
}

if (runAutomaton != null && !runAutomaton.run(value.bytes, value.offset, value.length)) {
return false;
}

return invalids == null || !invalids.contains(value);
}
}

/**
* Base class for filters that decide acceptance per global ordinal rather than per term value.
*/
public abstract static class OrdinalsFilter extends Filter {
/**
* Computes the set of accepted global ordinals.
* <p>
* The implementations visible in this file return a bit set sized to
* {@code globalOrdinals.getValueCount()} with one bit set per accepted ordinal.
*
* @param globalOrdinals the doc values whose ordinals are to be filtered
* @return a bit set marking the accepted ordinals
* @throws IOException declared for implementations that read from {@code globalOrdinals}
*/
public abstract LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException;

}

class PartitionedOrdinalsFilter extends OrdinalsFilter {
Expand All @@ -258,57 +244,64 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
}
}

static class AutomatonBackedOrdinalsFilter extends OrdinalsFilter {
static class CombinedOrdinalsFilter extends OrdinalsFilter {

private final CompiledAutomaton compiled;

private AutomatonBackedOrdinalsFilter(Automaton automaton) {
this.compiled = new CompiledAutomaton(automaton);
}

/**
* Computes which global ordinals are accepted by this IncludeExclude instance.
*
*/
@Override
public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
TermsEnum globalTermsEnum;
Terms globalTerms = new DocValuesTerms(globalOrdinals);
// TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
globalTermsEnum = compiled.getTermsEnum(globalTerms);
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
acceptedGlobalOrdinals.set(globalTermsEnum.ord());
}
return acceptedGlobalOrdinals;
}

}

static class TermListBackedOrdinalsFilter extends OrdinalsFilter {

private final SortedSet<BytesRef> includeValues;
private final SortedSet<BytesRef> excludeValues;

TermListBackedOrdinalsFilter(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
private CombinedOrdinalsFilter(Automaton automaton, SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
if (automaton != null) {
this.compiled = new CompiledAutomaton(automaton);
} else {
this.compiled = null;
}
this.includeValues = includeValues;
this.excludeValues = excludeValues;
}

/**
* Computes which global ordinals are accepted by this IncludeExclude instance.
*
*/
@Override
public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
LongBitSet acceptedGlobalOrdinals = null;
if (includeValues != null) {
acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
for (BytesRef term : includeValues) {
long ord = globalOrdinals.lookupTerm(term);
if (ord >= 0) {
acceptedGlobalOrdinals.set(ord);
}
}
} else if (acceptedGlobalOrdinals.length() > 0) {
// default to all terms being acceptable
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
}

if (compiled != null) {
LongBitSet automatonGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
TermsEnum globalTermsEnum;
Terms globalTerms = new DocValuesTerms(globalOrdinals);
// TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
globalTermsEnum = compiled.getTermsEnum(globalTerms);
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
automatonGlobalOrdinals.set(globalTermsEnum.ord());
}

if (acceptedGlobalOrdinals == null) {
acceptedGlobalOrdinals = automatonGlobalOrdinals;
} else {
acceptedGlobalOrdinals.and(automatonGlobalOrdinals);
}
}

if (acceptedGlobalOrdinals == null) {
acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
if (acceptedGlobalOrdinals.length() > 0) {
// default to all terms being acceptable
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
}
}

if (excludeValues != null) {
for (BytesRef term : excludeValues) {
long ord = globalOrdinals.lookupTerm(term);
Expand All @@ -319,9 +312,9 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
}
return acceptedGlobalOrdinals;
}

}


private final RegExp include, exclude;
private final SortedSet<BytesRef> includeValues, excludeValues;
private final int incZeroBasedPartition;
Expand All @@ -343,6 +336,15 @@ public IncludeExclude(RegExp include, RegExp exclude) {
this.incNumPartitions = 0;
}

/**
* Creates an instance that carries regular-expression criteria and value-set criteria together.
* <p>
* Each argument is stored as-is in the field of the same name; nothing is validated or copied here.
* Both partition fields ({@code incZeroBasedPartition} and {@code incNumPartitions}) are set to {@code 0}.
*
* @param include regular expression stored in {@code this.include}
* @param exclude regular expression stored in {@code this.exclude}
* @param includeValues sorted set of values stored in {@code this.includeValues}
* @param excludeValues sorted set of values stored in {@code this.excludeValues}
*/
public IncludeExclude(RegExp include, RegExp exclude, SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
this.include = include;
this.exclude = exclude;
this.includeValues = includeValues;
this.excludeValues = excludeValues;
this.incZeroBasedPartition = 0;
this.incNumPartitions = 0;
}

/**
* Creates an instance from regular expressions given as strings.
* <p>
* Each non-null string is wrapped in a {@code RegExp}; a {@code null} string is passed on as {@code null}.
* The resulting pair is handed to the {@code (RegExp, RegExp)} constructor.
*
* @param include include pattern as a string, or {@code null}
* @param exclude exclude pattern as a string, or {@code null}
*/
public IncludeExclude(String include, String exclude) {
this(include == null ? null : new RegExp(include), exclude == null ? null : new RegExp(exclude));
}
Expand Down Expand Up @@ -573,29 +575,25 @@ public boolean isPartitionBased() {

private Automaton toAutomaton() {
Automaton a = null;
if (include == null && exclude == null) {
return a;
}
if (include != null) {
a = include.toAutomaton();
} else if (includeValues != null) {
a = Automata.makeStringUnion(includeValues);
} else {
a = Automata.makeAnyString();
}
if (exclude != null) {
a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
} else if (excludeValues != null) {
a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
return a;
}

public StringFilter convertToStringFilter(DocValueFormat format) {
if (isRegexBased()) {
return new AutomatonBackedStringFilter(toAutomaton());
}
if (isPartitionBased()){
return new PartitionedStringFilter();
}
return new TermListBackedStringFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
return new CombinedStringFilter(toAutomaton(), parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
}

private static SortedSet<BytesRef> parseForDocValues(SortedSet<BytesRef> endUserFormattedValues, DocValueFormat format) {
Expand All @@ -612,15 +610,11 @@ private static SortedSet<BytesRef> parseForDocValues(SortedSet<BytesRef> endUser
}

public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format) {

if (isRegexBased()) {
return new AutomatonBackedOrdinalsFilter(toAutomaton());
}
if (isPartitionBased()){
return new PartitionedOrdinalsFilter();
}

return new TermListBackedOrdinalsFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
return new CombinedOrdinalsFilter(toAutomaton(), parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
}

public LongFilter convertToLongFilter(DocValueFormat format) {
Expand Down

0 comments on commit 3d12563

Please sign in to comment.