Skip to content

Commit

Permalink
PARQUET-2361: Reduce failure rate of unit test testParquetFileWithBloomFilterWithFpp
Browse files Browse the repository at this point in the history

Change-Id: Ic230f197b0996333a082bb05bd201963d05d862e
  • Loading branch information
fengjiajie committed Oct 14, 2023
1 parent de08d8d commit 0d3a20d
Showing 1 changed file with 20 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -288,14 +288,15 @@ public void testParquetFileWithBloomFilter() throws IOException {

@Test
public void testParquetFileWithBloomFilterWithFpp() throws IOException {
int totalCount = 100000;
double[] testFpp = {0.01, 0.05, 0.10, 0.15, 0.20, 0.25};
int buildBloomFilterCount = 100000;
double[] testFpps = {0.01, 0.05, 0.10, 0.15, 0.20, 0.25};
int randomStrLen = 12;
final int testBloomFilterCount = 200000;

Set<String> distinctStrings = new HashSet<>();
while (distinctStrings.size() < totalCount) {
Set<String> distinctStringsForFileGenerate = new HashSet<>();
while (distinctStringsForFileGenerate.size() < buildBloomFilterCount) {
String str = RandomStringUtils.randomAlphabetic(randomStrLen);
distinctStrings.add(str);
distinctStringsForFileGenerate.add(str);
}

MessageType schema = Types.buildMessage().
Expand All @@ -305,41 +306,41 @@ public void testParquetFileWithBloomFilterWithFpp() throws IOException {
GroupWriteSupport.setSchema(schema, conf);

GroupFactory factory = new SimpleGroupFactory(schema);
for (int i = 0; i < testFpp.length; i++) {
for (double testFpp : testFpps) {
File file = temp.newFile();
file.delete();
assertTrue(file.delete());
Path path = new Path(file.getAbsolutePath());
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
.withPageRowCountLimit(10)
.withConf(conf)
.withDictionaryEncoding(false)
.withBloomFilterEnabled("name", true)
.withBloomFilterNDV("name", totalCount)
.withBloomFilterFPP("name", testFpp[i])
.withBloomFilterNDV("name", buildBloomFilterCount)
.withBloomFilterFPP("name", testFpp)
.build()) {
java.util.Iterator<String> iterator = distinctStrings.iterator();
while (iterator.hasNext()) {
writer.write(factory.newGroup().append("name", iterator.next()));
for (String str : distinctStringsForFileGenerate) {
writer.write(factory.newGroup().append("name", str));
}
}
distinctStrings.clear();

try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData)
.readBloomFilter(blockMetaData.getColumns().get(0));

// The exist counts the number of times FindHash returns true.
int exist = 0;
while (distinctStrings.size() < totalCount) {
String str = RandomStringUtils.randomAlphabetic(randomStrLen - 2);
if (distinctStrings.add(str) &&
int falsePositive = 0;
Set<String> distinctStringsForProbe = new HashSet<>();
while (distinctStringsForProbe.size() < testBloomFilterCount) {
String str = RandomStringUtils.randomAlphabetic(randomStrLen - 1);
if (distinctStringsForProbe.add(str) &&
bloomFilter.findHash(LongHashFunction.xx(0).hashBytes(Binary.fromString(str).toByteBuffer()))) {
exist++;
falsePositive++;
}
}
// The exist should be less than totalCount * fpp. Add 10% here for error space.
assertTrue(exist < totalCount * (testFpp[i] * 1.1) && exist > 0);
int expectedFalsePositiveMaxCount = (int) Math.floor(testBloomFilterCount * (testFpp * 1.1));
assertTrue(falsePositive < expectedFalsePositiveMaxCount && falsePositive > 0);
}
}
}
Expand Down

0 comments on commit 0d3a20d

Please sign in to comment.