Skip to content

Commit

Permalink
LUCENE-8477: Automatically rewrite disjunctions when internal gaps ma…
Browse files Browse the repository at this point in the history
…tter (#620)

We have a number of IntervalsSource implementations where automatic minimization of
disjunctions can lead to surprising results:

* PHRASE queries can miss matches because a longer matching sub-source is minimized
  away, leaving a gap
* MAXGAPS queries can miss matches for the same reason
* CONTAINING, NOT_CONTAINING, CONTAINED_BY and NOT_CONTAINED_BY queries
  can miss matches if the 'big' interval gets minimized

The proper way to deal with this is to rewrite the queries by pulling disjunctions to the top
of the query tree, so that PHRASE("a", OR(PHRASE("b", "c"), "c")) is rewritten to
OR(PHRASE("a", "b", "c"), PHRASE("a", "c")). To be able to do this generally, we need to
add a new pullUpDisjunctions() method to IntervalsSource that performs this rewriting
for each source that it would apply to.

Because these rewritten queries will in general be less efficient due to the duplication of
effort (eg the rewritten PHRASE query above pulls 5 term iterators rather than 4 in the
original), we also add an option to Intervals.or() that will prevent this happening, so that
consumers can choose speed over accuracy if it suits their usecase.
  • Loading branch information
romseygeek authored Mar 27, 2019
1 parent e7939d5 commit f1782d0
Show file tree
Hide file tree
Showing 35 changed files with 1,964 additions and 1,012 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.search.intervals;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

class BlockIntervalsSource extends ConjunctionIntervalsSource {

static IntervalsSource build(List<IntervalsSource> subSources) {
if (subSources.size() == 1) {
return subSources.get(0);
}
return Intervals.or(Disjunctions.pullUp(subSources, BlockIntervalsSource::new));
}

private static List<IntervalsSource> flatten(List<IntervalsSource> sources) {
List<IntervalsSource> flattened = new ArrayList<>();
for (IntervalsSource s : sources) {
if (s instanceof BlockIntervalsSource) {
flattened.addAll(((BlockIntervalsSource)s).subSources);
}
else {
flattened.add(s);
}
}
return flattened;
}

private BlockIntervalsSource(List<IntervalsSource> sources) {
super(flatten(sources), true);
}

@Override
protected IntervalIterator combine(List<IntervalIterator> iterators) {
return new BlockIntervalIterator(iterators);
}

@Override
public int minExtent() {
int minExtent = 0;
for (IntervalsSource subSource : subSources) {
minExtent += subSource.minExtent();
}
return minExtent;
}

@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
return Collections.singletonList(this); // Disjunctions already pulled up in build()
}

@Override
public int hashCode() {
return Objects.hash(subSources);
}

@Override
public boolean equals(Object other) {
if (other instanceof BlockIntervalsSource == false) return false;
BlockIntervalsSource b = (BlockIntervalsSource) other;
return Objects.equals(this.subSources, b.subSources);
}

@Override
public String toString() {
return "BLOCK(" + subSources.stream().map(IntervalsSource::toString).collect(Collectors.joining(",")) + ")";
}

private static class BlockIntervalIterator extends ConjunctionIntervalIterator {

int start = -1, end = -1;

BlockIntervalIterator(List<IntervalIterator> subIterators) {
super(subIterators);
}

@Override
public int start() {
return start;
}

@Override
public int end() {
return end;
}

@Override
public int gaps() {
return 0;
}

@Override
public int nextInterval() throws IOException {
if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start = end = IntervalIterator.NO_MORE_INTERVALS;
int i = 1;
while (i < subIterators.size()) {
while (subIterators.get(i).start() <= subIterators.get(i - 1).end()) {
if (subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start = end = IntervalIterator.NO_MORE_INTERVALS;
}
if (subIterators.get(i).start() == subIterators.get(i - 1).end() + 1) {
i = i + 1;
}
else {
if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
return start = end = IntervalIterator.NO_MORE_INTERVALS;
i = 1;
}
}
start = subIterators.get(0).start();
end = subIterators.get(subIterators.size() - 1).end();
return start;
}

@Override
protected void reset() {
start = end = -1;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

import org.apache.lucene.index.LeafReaderContext;
Expand All @@ -31,28 +30,15 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;

class ConjunctionIntervalsSource extends IntervalsSource {
abstract class ConjunctionIntervalsSource extends IntervalsSource {

protected final List<IntervalsSource> subSources;
protected final IntervalFunction function;
protected final boolean isMinimizing;

ConjunctionIntervalsSource(List<IntervalsSource> subSources, IntervalFunction function) {
protected ConjunctionIntervalsSource(List<IntervalsSource> subSources, boolean isMinimizing) {
assert subSources.size() > 1;
this.subSources = subSources;
this.function = function;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
ConjunctionIntervalsSource that = (ConjunctionIntervalsSource) o;
return Objects.equals(subSources, that.subSources) &&
Objects.equals(function, that.function);
}

@Override
public String toString() {
return function + subSources.stream().map(Object::toString).collect(Collectors.joining(",", "(", ")"));
this.isMinimizing = isMinimizing;
}

@Override
Expand All @@ -65,49 +51,40 @@ public void visit(String field, QueryVisitor visitor) {
}

@Override
public int minExtent() {
int minExtent = 0;
for (IntervalsSource source : subSources) {
minExtent += source.minExtent();
}
return minExtent;
}

@Override
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
public final IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
List<IntervalIterator> subIntervals = new ArrayList<>();
for (IntervalsSource source : subSources) {
IntervalIterator it = source.intervals(field, ctx);
if (it == null)
return null;
subIntervals.add(it);
}
return function.apply(subIntervals);
return combine(subIntervals);
}

protected abstract IntervalIterator combine(List<IntervalIterator> iterators);

@Override
public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
public final MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
List<MatchesIterator> subs = new ArrayList<>();
for (IntervalsSource source : subSources) {
MatchesIterator mi = source.matches(field, ctx, doc);
if (mi == null) {
return null;
}
if (isMinimizing) {
mi = new CachingMatchesIterator(mi);
}
subs.add(mi);
}
IntervalIterator it = function.apply(subs.stream().map(m -> IntervalMatches.wrapMatches(m, doc)).collect(Collectors.toList()));
IntervalIterator it = combine(subs.stream().map(m -> IntervalMatches.wrapMatches(m, doc)).collect(Collectors.toList()));
if (it.advance(doc) != doc) {
return null;
}
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
return null;
}
return new ConjunctionMatchesIterator(it, subs);
}

@Override
public int hashCode() {
return Objects.hash(subSources, function);
return isMinimizing ? new MinimizingConjunctionMatchesIterator(it, subs) : new ConjunctionMatchesIterator(it, subs);
}

private static class ConjunctionMatchesIterator implements MatchesIterator {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.search.intervals;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Objects;

class ContainedByIntervalsSource extends ConjunctionIntervalsSource {

static IntervalsSource build(IntervalsSource small, IntervalsSource big) {
return Intervals.or(Disjunctions.pullUp(big, s -> new ContainedByIntervalsSource(small, s)));
}

private final IntervalsSource small;
private final IntervalsSource big;

private ContainedByIntervalsSource(IntervalsSource small, IntervalsSource big) {
super(Arrays.asList(small, big), false);
this.small = small;
this.big = big;
}

@Override
protected IntervalIterator combine(List<IntervalIterator> iterators) {
assert iterators.size() == 2;
IntervalIterator a = iterators.get(0);
IntervalIterator b = iterators.get(1);
return new FilteringIntervalIterator(a, b) {
@Override
public int nextInterval() throws IOException {
if (bpos == false)
return IntervalIterator.NO_MORE_INTERVALS;
while (a.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
while (b.end() < a.end()) {
if (b.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
bpos = false;
return IntervalIterator.NO_MORE_INTERVALS;
}
}
if (b.start() <= a.start())
return a.start();
}
bpos = false;
return IntervalIterator.NO_MORE_INTERVALS;
}
};
}

@Override
public int minExtent() {
return small.minExtent();
}

@Override
public Collection<IntervalsSource> pullUpDisjunctions() {
return Disjunctions.pullUp(big, s -> new ContainedByIntervalsSource(small, s));
}

@Override
public int hashCode() {
return Objects.hashCode(subSources);
}

@Override
public boolean equals(Object other) {
if (other instanceof ContainedByIntervalsSource == false) return false;
ContainedByIntervalsSource o = (ContainedByIntervalsSource) other;
return Objects.equals(this.subSources, o.subSources);
}

@Override
public String toString() {
return "CONTAINED_BY(" + small + "," + big + ")";
}
}
Loading

0 comments on commit f1782d0

Please sign in to comment.