From 65820e5170ed15e91cc3349e6dd4da90689ecd5d Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Tue, 18 May 2021 09:37:53 -0700 Subject: [PATCH] LUCENE-9953: Make FacetResult#value accurate for LongValueFacetCounts multi-value doc cases (#131) Co-authored-by: Greg Miller --- lucene/CHANGES.txt | 4 ++++ .../src/java/org/apache/lucene/facet/FacetResult.java | 7 +++++-- .../org/apache/lucene/facet/LongValueFacetCounts.java | 8 ++++++-- .../org/apache/lucene/facet/TestLongValueFacetCounts.java | 8 ++++---- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ae1b2302c774..2c19c5001261 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -390,6 +390,10 @@ Bug Fixes * LUCENE-9958: Fixed performance regression for boolean queries that configure a minimum number of matching clauses. (Adrien Grand, Matt Weber) + +* LUCENE-9953: LongValueFacetCounts should count each document at most once when determining + the total count for a dimension. Prior to this fix, multi-value docs could contribute a > 1 + count to the dimension count. (Greg Miller) Other --------------------- diff --git a/lucene/facet/src/java/org/apache/lucene/facet/FacetResult.java b/lucene/facet/src/java/org/apache/lucene/facet/FacetResult.java index bbc61ce3fac4..39556466c53a 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/FacetResult.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/FacetResult.java @@ -28,9 +28,12 @@ public final class FacetResult { public final String[] path; /** - * Total value for this path (sum of all child counts, or sum of all child values), even those not - * included in the topN. + * Total number of documents containing a value for this path, even those not included in the + * topN. If a document contains multiple values for the same path, it will only be counted once in + * this value. */ + // TODO: This may not hold true for SSDV faceting, where docs can be counted more than + // once. We should fix this. See LUCENE-9952 public final Number value; /** How many child labels were encountered. */ diff --git a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java index 520d931d66c0..cd331ace14eb 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java @@ -160,7 +160,9 @@ private void count(String field, List matchingDocs) throws IOExcep for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { int limit = multiValues.docValueCount(); - totCount += limit; + if (limit > 0) { + totCount++; + } for (int i = 0; i < limit; i++) { increment(multiValues.nextValue()); } @@ -204,7 +206,9 @@ private void countAll(IndexReader reader, String field) throws IOException { while (multiValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { int limit = multiValues.docValueCount(); - totCount += limit; + if (limit > 0) { + totCount++; + } for (int i = 0; i < limit; i++) { increment(multiValues.nextValue()); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java index 5821052d12ea..7fa6dc0a5932 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java @@ -429,7 +429,8 @@ public void testRandomMultiValued() throws Exception { int expectedChildCount = 0; int expectedTotalCount = 0; for (int i = 0; i < valueCount; i++) { - if (values[i] != null) { + if (values[i] != null && values[i].length > 0) { + expectedTotalCount++; for (long value : values[i]) { Integer curCount = expected.get(value); if (curCount == null) { @@ -437,7 +438,6 @@ public void testRandomMultiValued() throws Exception { expectedChildCount++; } expected.put(value, curCount + 1); - expectedTotalCount++; } } } @@ -520,9 +520,9 @@ public void testRandomMultiValued() throws Exception { expectedChildCount = 0; int totCount = 0; for (int i = minId; i <= maxId; i++) { - if (values[i] != null) { + if (values[i] != null && values[i].length > 0) { + totCount++; for (long value : values[i]) { - totCount++; Integer curCount = expected.get(value); if (curCount == null) { expectedChildCount++;