Skip to content

Commit

Permalink
fix NPE during completeness calculation; refactor completeness calcul…
Browse files Browse the repository at this point in the history
…ation
  • Loading branch information
jmkeil committed Oct 2, 2024
1 parent 75d12c1 commit 9e99d80
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 100 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@ private ResourcePair(Resource first, Resource second) {
this.second = second;
}

/**
 * Collects every resource that occurs in at least one of the given pairs.
 *
 * @param pairs the pairs whose members are gathered
 * @return a new mutable set holding the {@code first} and {@code second}
 *         resource of each pair
 */
public static Set<Resource> getResourcesOfPairs(Iterable<ResourcePair> pairs) {
    Set<Resource> members = new HashSet<>();
    pairs.forEach(pair -> {
        members.add(pair.first);
        members.add(pair.second);
    });
    return members;
}

public static Set<ResourcePair> getPairsOf(Set<Resource> resources) {
Set<ResourcePair> pairs = new HashSet<>();
for (Resource first : resources) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*-
* Copyright © 2019-2022 Heinz Nixdorf Chair for Distributed Information Systems,
* Friedrich Schiller University Jena (http://www.fusion.uni-jena.de/)
* Copyright © 2023-2024 Jan Martin Keil ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
-*/

package de.uni_jena.cs.fusion.abecto.measure;

import de.uni_jena.cs.fusion.abecto.Aspect;
import de.uni_jena.cs.fusion.abecto.Metadata;
import de.uni_jena.cs.fusion.abecto.ResourcePair;
import de.uni_jena.cs.fusion.abecto.vocabulary.AV;
import de.uni_jena.cs.fusion.abecto.vocabulary.OM;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.Resource;

import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.*;

/**
 * Per-dataset completeness measure: the ratio of a dataset's deduplicated
 * count to a population size estimated from the pairwise overlap of the
 * datasets.
 */
public class Completeness extends Ratio<Resource> {

    /**
     * Creates a measure without any values, using the quantity
     * {@code AV.marCompletenessThomas08} and the unit {@code OM.one}.
     */
    public Completeness() {
        super(AV.marCompletenessThomas08, OM.one);
    }

    /**
     * Calculates the completeness of each dataset.
     * <p>
     * Only dataset pairs of {@code absoluteCoverage} for which both datasets
     * have an entry in {@code deduplicatedCount} are taken into account.
     *
     * @param absoluteCoverage  overlap per dataset pair
     * @param deduplicatedCount deduplicated count per dataset
     * @return the completeness per dataset, or a measure without values if
     *         the total pairwise overlap is zero
     */
    public static Completeness calculate(SymmetricPerDatasetPairCount absoluteCoverage, PerDatasetCount deduplicatedCount) {
        Set<ResourcePair> usablePairs = getDatasetPairsWithSufficientData(absoluteCoverage, deduplicatedCount);
        long overlapSum = calculateTotalPairwiseOverlap(usablePairs, absoluteCoverage);
        if (overlapSum == 0) {
            // no overlap: the population size can not be estimated, result stays empty
            return new Completeness();
        }
        BigDecimal populationEstimate = calculateEstimatedPopulationSize(usablePairs, deduplicatedCount, overlapSum);
        Set<Resource> coveredDatasets = ResourcePair.getResourcesOfPairs(usablePairs);
        return calculateCompleteness(coveredDatasets, deduplicatedCount, populationEstimate);
    }

    /**
     * Selects the dataset pairs of {@code absoluteCoverage} whose two
     * datasets both have an entry in {@code deduplicatedCount}.
     *
     * @return a new set containing the selected pairs
     */
    private static Set<ResourcePair> getDatasetPairsWithSufficientData(SymmetricPerDatasetPairCount absoluteCoverage, PerDatasetCount deduplicatedCount) {
        Set<ResourcePair> sufficientPairs = new HashSet<>();
        Set<Resource> countedDatasets = deduplicatedCount.keySet();
        for (ResourcePair candidate : absoluteCoverage.keySet()) {
            if (!notBothContainedIn(candidate, countedDatasets)) {
                sufficientPairs.add(candidate);
            }
        }
        return sufficientPairs;
    }

    /**
     * @return {@code true} if at least one resource of the pair is missing
     *         in the collection
     */
    private static boolean notBothContainedIn(ResourcePair pair, Collection<Resource> collection) {
        return !(collection.contains(pair.first) && collection.contains(pair.second));
    }

    /**
     * Sums up the overlap of all given dataset pairs that are present in
     * {@code absoluteCoverage}.
     */
    private static long calculateTotalPairwiseOverlap(Iterable<ResourcePair> datasetPairs, SymmetricPerDatasetPairCount absoluteCoverage) {
        long overlapSum = 0L;
        for (ResourcePair pair : datasetPairs) {
            if (!absoluteCoverage.contains(pair)) {
                continue;
            }
            overlapSum += absoluteCoverage.get(pair);
        }
        return overlapSum;
    }

    /**
     * Estimates the population size as the sum over all pairs of the product
     * of both deduplicated counts, divided by the total pairwise overlap.
     *
     * @param totalPairwiseOverlap divisor of the estimation, must not be zero
     */
    private static BigDecimal calculateEstimatedPopulationSize(Iterable<ResourcePair> datasetPairs, PerDatasetCount deduplicatedCount, long totalPairwiseOverlap) {
        BigDecimal productSum = BigDecimal.ZERO;
        for (ResourcePair pair : datasetPairs) {
            BigDecimal countOfFirst = BigDecimal.valueOf(deduplicatedCount.get(pair.first));
            BigDecimal countOfSecond = BigDecimal.valueOf(deduplicatedCount.get(pair.second));
            productSum = productSum.add(countOfFirst.multiply(countOfSecond));
        }
        BigDecimal divisor = BigDecimal.valueOf(totalPairwiseOverlap);
        return productSum.divide(divisor, SCALE, RoundingMode.HALF_UP);
    }

    /**
     * Builds the measure by dividing the deduplicated count of each given
     * dataset by the estimated population size.
     */
    private static Completeness calculateCompleteness(Iterable<Resource> datasets, PerDatasetCount deduplicatedCount, BigDecimal estimatedPopulationSize) {
        Completeness result = new Completeness();
        for (Resource dataset : datasets) {
            BigDecimal count = BigDecimal.valueOf(deduplicatedCount.get(dataset));
            result.set(dataset, count.divide(estimatedPopulationSize, SCALE, ROUNDING_MODE));
        }
        return result;
    }

    /**
     * Adds one quality measurement per dataset that has a value to the model
     * registered for that dataset in {@code outputModelsMap}. Each
     * measurement is passed the set of all other datasets that have a value.
     *
     * @param aspect          the aspect whose IRI is attached to the measurements
     * @param outputModelsMap the output model per dataset
     */
    public void storeInModel(Aspect aspect, Map<Resource, Model> outputModelsMap) {
        for (Resource dataset : values.keySet()) {
            // every dataset with a value except the current one
            Collection<Resource> remainingDatasets = new HashSet<>(values.keySet());
            remainingDatasets.remove(dataset);
            Metadata.addQualityMeasurement(
                    quantity,
                    get(dataset),
                    unit,
                    dataset,
                    variable,
                    remainingDatasets,
                    aspect.getIri(),
                    outputModelsMap.get(dataset));
        }
    }
}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,12 @@

import com.google.common.collect.Streams;
import de.uni_jena.cs.fusion.abecto.Aspect;
import de.uni_jena.cs.fusion.abecto.ResourcePair;
import de.uni_jena.cs.fusion.abecto.measure.PerDatasetCount;
import de.uni_jena.cs.fusion.abecto.measure.SymmetricPerDatasetPairCount;
import de.uni_jena.cs.fusion.abecto.measure.PerDatasetRatio;
import de.uni_jena.cs.fusion.abecto.vocabulary.AV;
import de.uni_jena.cs.fusion.abecto.vocabulary.OM;
import org.apache.jena.query.Query;
import org.apache.jena.query.QueryExecutionFactory;
import org.apache.jena.query.ResultSet;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.Resource;

import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
Expand Down Expand Up @@ -96,36 +88,4 @@ Map<Resource, Model> getOutputMetaModels(Iterable<Resource> datasets) {
}
return outputMetaModelByDataset;
}

/**
 * Calculates the completeness ratio per dataset from the deduplicated counts
 * and the population size estimated for the given dataset pairs.
 *
 * @return the ratio measure; it is returned without any values set if the
 *         total pairwise overlap is zero
 */
PerDatasetRatio calculateCompleteness(Iterable<ResourcePair> datasetPairs, SymmetricPerDatasetPairCount absoluteCoverage, PerDatasetCount deduplicatedCount) {
    PerDatasetRatio ratio = new PerDatasetRatio(AV.marCompletenessThomas08, OM.one);
    long overlapSum = calculateTotalPairwiseOverlap(datasetPairs, absoluteCoverage);
    if (overlapSum == 0) {
        // no overlap: the population size can not be estimated
        return ratio;
    }
    BigDecimal populationEstimate = calculateEstimatedPopulationSize(datasetPairs, deduplicatedCount, overlapSum);
    ratio.setRatioOf(deduplicatedCount, populationEstimate);
    return ratio;
}

/**
 * Sums up the overlap of all given dataset pairs that are present in
 * {@code absoluteCoverage}; pairs without an entry are ignored.
 */
long calculateTotalPairwiseOverlap(Iterable<ResourcePair> datasetPairs, SymmetricPerDatasetPairCount absoluteCoverage) {
    long overlapSum = 0L;
    for (ResourcePair pair : datasetPairs) {
        if (!absoluteCoverage.contains(pair)) {
            continue;
        }
        overlapSum += absoluteCoverage.get(pair);
    }
    return overlapSum;
}

/**
 * Estimates the population size as the sum over all pairs of the product of
 * both deduplicated counts, divided by the total pairwise overlap.
 *
 * @param totalPairwiseOverlap divisor of the estimation, must not be zero
 */
BigDecimal calculateEstimatedPopulationSize(Iterable<ResourcePair> datasetPairs, PerDatasetCount deduplicatedCount, long totalPairwiseOverlap) {
    BigDecimal productSum = BigDecimal.ZERO;
    for (ResourcePair pair : datasetPairs) {
        BigDecimal countOfFirst = BigDecimal.valueOf(deduplicatedCount.get(pair.first));
        BigDecimal countOfSecond = BigDecimal.valueOf(deduplicatedCount.get(pair.second));
        productSum = productSum.add(countOfFirst.multiply(countOfSecond));
    }
    BigDecimal divisor = BigDecimal.valueOf(totalPairwiseOverlap);
    return productSum.divide(divisor, SCALE, RoundingMode.HALF_UP);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public class PopulationComparisonProcessor extends ComparisonProcessor<Populatio
*/
PerDatasetCount deduplicatedCount = new PerDatasetCount(AV.deduplicatedCount, OM.one);
PerDatasetTupelRatio relativeCoverage = new PerDatasetTupelRatio(AV.relativeCoverage, OM.one);
PerDatasetRatio completeness;
Completeness completeness;

Map<Resource, Set<Resource>> unprocessedResourcesByDataset = new HashMap<>();

Expand All @@ -89,15 +89,15 @@ void compareAspectPopulation(Aspect aspect) {
measureResourceCounts();
countAndReportCoverageAndDuplicatesAndOmissions(getCorrespondenceGroups());
calculateDeduplicatedCount();
completeness = calculateCompleteness(datasetPairs, absoluteCoverage, deduplicatedCount);
completeness = Completeness.calculate(absoluteCoverage, deduplicatedCount);
calculateRelativeCoverages();

count.storeInModel(aspect, outputMetaModelByDataset);
deduplicatedCount.storeInModel(aspect, outputMetaModelByDataset);
// TODO store duplicateCount (requires definition of measure IRI)
absoluteCoverage.storeInModel(aspect, outputMetaModelByDataset);
relativeCoverage.storeInModel(aspect, outputMetaModelByDataset);
completeness.storeInModelAsComparedToAllOtherResources(aspect, outputMetaModelByDataset);
completeness.storeInModel(aspect, outputMetaModelByDataset);
reportOmissionsOfUnprocessedResources();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ public class PropertyComparisonProcessor extends ComparisonProcessor<PropertyCom
* Number of distinct values in this dataset, per variable. Index: variable, affectedDataset
*/
Map<String, PerDatasetCount> distinctValuesCount;
Map<String, PerDatasetRatio> valueCompleteness;
Map<String, Completeness> valueCompleteness;

Map<Resource, Set<Resource>> unprocessedResourcesByDataset = new HashMap<>();
Map<String, Map<Resource, Map<RDFNode, Set<Resource>>>> resourcesByNonDistinctValueByDatasetByVariable = new HashMap<>();
Expand Down Expand Up @@ -547,7 +547,7 @@ protected void calculateRelativeCoverage() {

protected void calculateCompleteness() {
for (String variable : variables) {
PerDatasetRatio valueCompletenessOfVariable = calculateCompleteness(datasetPairs, absoluteValueCoverage.get(variable), distinctValuesCount.get(variable));
Completeness valueCompletenessOfVariable = Completeness.calculate(absoluteValueCoverage.get(variable), distinctValuesCount.get(variable));
valueCompletenessOfVariable.setVariable(variable);
valueCompleteness.put(variable, valueCompletenessOfVariable);
}
Expand All @@ -563,6 +563,6 @@ protected void storeMeasures() {
// TODO add value exclusion filter description to measurement description
Measure.storeMeasuresByVariableInModel(relativeValueCoverage, theAspect, outputMetaModelByDataset);
// TODO add value exclusion filter description to measurement description
PerDatasetRatio.storeMeasuresByVariableInModelAsComparedToAllOtherResources(valueCompleteness, theAspect, outputMetaModelByDataset);
Measure.storeMeasuresByVariableInModel(valueCompleteness, theAspect, outputMetaModelByDataset);
}
}

0 comments on commit 9e99d80

Please sign in to comment.