forked from apache/druid
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Direct UTF-8 access for "in" filters.
Directly related: 1) InDimFilter: Store sorted Strings (in ValuesSet) plus sorted UTF-8 ByteBuffers (in valuesUtf8). Use valuesUtf8 whenever possible. If necessary, the input set is copied into a ValuesSet. Much logic is simplified, because we always know what type the values set will be. I think that there won't even be an efficiency loss in most cases. InDimFilter is most frequently created by deserialization, and this patch updates the JsonCreator constructor to deserialize directly into a ValuesSet. 2) Add Utf8ValueSetIndex, which InDimFilter uses to avoid UTF-8 decodes during index lookups. 3) Add unsigned comparator to ByteBufferUtils and use it in GenericIndexed.BYTE_BUFFER_STRATEGY. This is important because UTF-8 bytes can be compared as bytes if, and only if, the comparison is unsigned. 4) Add specialization to GenericIndexed.singleThreaded().indexOf that avoids needless ByteBuffer allocations. 5) Clarify that objects returned by ColumnIndexSupplier.as are not thread-safe. DictionaryEncodedStringIndexSupplier now calls singleThreaded() on all relevant GenericIndexed objects, saving a ByteBuffer allocation per access. Also: 1) Fix performance regression in LikeFilter: since apache#12315, it applied the suffix matcher to all values in range even for type MATCH_ALL. 2) Add ObjectStrategy.canCompare() method. This fixes LikeFilterBenchmark, which was broken due to calls to strategy.compare in GenericIndexed.fromIterable.
- Loading branch information
Showing
34 changed files
with
699 additions
and
218 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
133 changes: 133 additions & 0 deletions
133
benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.apache.druid.benchmark; | ||
|
||
import com.google.common.collect.FluentIterable; | ||
import org.apache.druid.collections.bitmap.BitmapFactory; | ||
import org.apache.druid.collections.bitmap.ImmutableBitmap; | ||
import org.apache.druid.collections.bitmap.MutableBitmap; | ||
import org.apache.druid.collections.bitmap.RoaringBitmapFactory; | ||
import org.apache.druid.common.config.NullHandling; | ||
import org.apache.druid.java.util.common.StringUtils; | ||
import org.apache.druid.query.filter.ColumnIndexSelector; | ||
import org.apache.druid.query.filter.InDimFilter; | ||
import org.apache.druid.segment.data.BitmapSerdeFactory; | ||
import org.apache.druid.segment.data.GenericIndexed; | ||
import org.apache.druid.segment.data.RoaringBitmapSerdeFactory; | ||
import org.apache.druid.segment.filter.Filters; | ||
import org.apache.druid.segment.serde.DictionaryEncodedStringIndexSupplier; | ||
import org.openjdk.jmh.annotations.Benchmark; | ||
import org.openjdk.jmh.annotations.BenchmarkMode; | ||
import org.openjdk.jmh.annotations.Fork; | ||
import org.openjdk.jmh.annotations.Measurement; | ||
import org.openjdk.jmh.annotations.Mode; | ||
import org.openjdk.jmh.annotations.OutputTimeUnit; | ||
import org.openjdk.jmh.annotations.Param; | ||
import org.openjdk.jmh.annotations.Scope; | ||
import org.openjdk.jmh.annotations.Setup; | ||
import org.openjdk.jmh.annotations.State; | ||
import org.openjdk.jmh.annotations.Warmup; | ||
import org.openjdk.jmh.infra.Blackhole; | ||
|
||
import java.nio.ByteBuffer; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.IntStream; | ||
|
||
@State(Scope.Benchmark) | ||
@Fork(value = 1) | ||
@Warmup(iterations = 10) | ||
@Measurement(iterations = 10) | ||
public class InFilterBenchmark | ||
{ | ||
static { | ||
NullHandling.initializeForTests(); | ||
} | ||
|
||
private static final int START_INT = 10_000_000; | ||
|
||
private InDimFilter inFilter; | ||
|
||
// cardinality of the dictionary. it will contain this many ints (as strings, of course), starting at START_INT, | ||
// even numbers only. | ||
@Param({"1000000"}) | ||
int dictionarySize; | ||
|
||
// cardinality of the "in" filter. half of its values will be in the dictionary, half will not. | ||
@Param({"10000"}) | ||
int filterSize; | ||
|
||
// selector will contain a "dictionarySize" number of bitmaps; each one contains a single int. | ||
// this benchmark is not about bitmap union speed, so no need for that part to be realistic. | ||
ColumnIndexSelector selector; | ||
|
||
@Setup | ||
public void setup() | ||
{ | ||
final BitmapFactory bitmapFactory = new RoaringBitmapFactory(); | ||
final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory(null); | ||
final Iterable<Integer> ints = intGenerator(); | ||
final GenericIndexed<String> dictionary = GenericIndexed.fromIterable( | ||
FluentIterable.from(ints) | ||
.transform(Object::toString), | ||
GenericIndexed.STRING_STRATEGY | ||
); | ||
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable( | ||
FluentIterable.from(ints) | ||
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))), | ||
GenericIndexed.BYTE_BUFFER_STRATEGY | ||
); | ||
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable( | ||
() -> IntStream.range(0, dictionarySize) | ||
.mapToObj( | ||
i -> { | ||
final MutableBitmap mutableBitmap = bitmapFactory.makeEmptyMutableBitmap(); | ||
mutableBitmap.add(i); | ||
return bitmapFactory.makeImmutableBitmap(mutableBitmap); | ||
} | ||
) | ||
.iterator(), | ||
serdeFactory.getObjectStrategy() | ||
); | ||
selector = new MockColumnIndexSelector( | ||
bitmapFactory, | ||
new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, dictionaryUtf8, bitmaps, null) | ||
); | ||
inFilter = new InDimFilter( | ||
"dummy", | ||
IntStream.range(START_INT, START_INT + filterSize).mapToObj(String::valueOf).collect(Collectors.toSet()) | ||
); | ||
} | ||
|
||
@Benchmark | ||
@BenchmarkMode(Mode.AverageTime) | ||
@OutputTimeUnit(TimeUnit.MICROSECONDS) | ||
public void doFilter(Blackhole blackhole) | ||
{ | ||
final ImmutableBitmap bitmapIndex = Filters.computeDefaultBitmapResults(inFilter, selector); | ||
blackhole.consume(bitmapIndex); | ||
} | ||
|
||
private Iterable<Integer> intGenerator() | ||
{ | ||
// i * 2 => half of these values will be present in the inFilter, half won't. | ||
return () -> IntStream.range(0, dictionarySize).map(i -> START_INT + i * 2).boxed().iterator(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.