From 10608b5bada3dce3a7e3f8c1d2c5d74e61f7b79a Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Wed, 4 Dec 2024 10:34:55 +0100
Subject: [PATCH] Improve search equivalence tests.

This addresses an existing TODO about giving terms a zipfian distribution, and
disables query caching to make sure that two-phase iterators are properly
tested.
---
 .../tests/search/SearchEquivalenceTestBase.java | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/SearchEquivalenceTestBase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/SearchEquivalenceTestBase.java
index 63a478951410..8831a3fcc7ae 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/SearchEquivalenceTestBase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/SearchEquivalenceTestBase.java
@@ -94,7 +94,11 @@ public static void beforeClass() throws Exception {
 
     reader = iw.getReader();
     s1 = newSearcher(reader);
+    // Disable the query cache, which converts two-phase iterators to normal iterators, while we
+    // want to make sure two-phase iterators are exercised.
+    s1.setQueryCache(null);
     s2 = newSearcher(reader);
+    s2.setQueryCache(null);
     iw.close();
   }
 
@@ -114,7 +118,6 @@ public static void afterClass() throws Exception {
    * tokenization can be assumed to be on whitespace.
    */
   static String randomFieldContents() {
-    // TODO: zipf-like distribution
     StringBuilder sb = new StringBuilder();
     int numTerms = random().nextInt(15);
     for (int i = 0; i < numTerms; i++) {
@@ -128,7 +131,13 @@ static String randomFieldContents() {
 
   /** returns random character (a-z) */
   static char randomChar() {
-    return (char) TestUtil.nextInt(random(), 'a', 'z');
+    char c = (char) TestUtil.nextInt(random(), 'a', 'z');
+    if (random().nextBoolean()) {
+      // bias towards earlier chars, so that chars have a ~ zipfian distribution with earlier chars
+      // having a higher frequency
+      c = (char) TestUtil.nextInt(random(), 'a', c);
+    }
+    return c;
   }
 
   /** returns a term suitable for searching. terms are single characters in lowercase (a-z) */