diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/ResourcePair.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/ResourcePair.java index 7271482..cc12384 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/ResourcePair.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/ResourcePair.java @@ -32,6 +32,15 @@ private ResourcePair(Resource first, Resource second) { this.second = second; } + /** Collects pair.first and pair.second of every given pair into one new set and returns it. */ public static Set getResourcesOfPairs(Iterable pairs) { + Set resources = new HashSet<>(); + for (ResourcePair pair : pairs) { + resources.add(pair.first); + resources.add(pair.second); + } + return resources; + } + public static Set getPairsOf(Set resources) { Set pairs = new HashSet<>(); for (Resource first : resources) { diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Completeness.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Completeness.java new file mode 100644 index 0000000..f5532c0 --- /dev/null +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Completeness.java @@ -0,0 +1,101 @@ +/*- + * Copyright © 2019-2022 Heinz Nixdorf Chair for Distributed Information Systems, + * Friedrich Schiller University Jena (http://www.fusion.uni-jena.de/) + * Copyright © 2023-2024 Jan Martin Keil (jan-martin.keil@uni-jena.de) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+-*/ + +package de.uni_jena.cs.fusion.abecto.measure; + +import de.uni_jena.cs.fusion.abecto.Aspect; +import de.uni_jena.cs.fusion.abecto.Metadata; +import de.uni_jena.cs.fusion.abecto.ResourcePair; +import de.uni_jena.cs.fusion.abecto.vocabulary.AV; +import de.uni_jena.cs.fusion.abecto.vocabulary.OM; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.Resource; + +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.*; + /** Ratio measure constructed with quantity AV.marCompletenessThomas08 and unit OM.one; calculate() sets one value per dataset. */ +public class Completeness extends Ratio { + /** Passes AV.marCompletenessThomas08 and OM.one to the Ratio constructor; calculate() returns such an instance untouched as its empty result. */ + public Completeness() { + super(AV.marCompletenessThomas08, OM.one); + } + /** Returns, per dataset occurring in a usable pair, its deduplicated count divided by the estimated population size; when the total pairwise overlap is 0 no value is set, which also keeps the overlap from being used as a zero divisor. */ + public static Completeness calculate(SymmetricPerDatasetPairCount absoluteCoverage, PerDatasetCount deduplicatedCount) { + Set datasetPairs = getDatasetPairsWithSufficientData(absoluteCoverage, deduplicatedCount); + long totalPairwiseOverlap = calculateTotalPairwiseOverlap(datasetPairs, absoluteCoverage); + if (totalPairwiseOverlap != 0) { + BigDecimal estimatedPopulationSize = calculateEstimatedPopulationSize(datasetPairs, deduplicatedCount, totalPairwiseOverlap); + Set datasets = ResourcePair.getResourcesOfPairs(datasetPairs); + return calculateCompleteness(datasets, deduplicatedCount, estimatedPopulationSize); + } + return new Completeness(); // empty + } + /** Copies absoluteCoverage.keySet() and removes every pair that has a member missing from deduplicatedCount.keySet(). */ + private static Set getDatasetPairsWithSufficientData(SymmetricPerDatasetPairCount absoluteCoverage, PerDatasetCount deduplicatedCount) { + Set datasetPairs = new HashSet<>(absoluteCoverage.keySet()); + Set datasetsWithDeduplicatedCount = deduplicatedCount.keySet(); + datasetPairs.removeIf(pair -> notBothContainedIn(pair, datasetsWithDeduplicatedCount)); + return datasetPairs; + } + /** Returns true if pair.first or pair.second is not contained in the collection. */ + private static boolean notBothContainedIn(ResourcePair pair, Collection collection) { + return !collection.contains(pair.first) || !collection.contains(pair.second); + } + /** Sums absoluteCoverage.get(pair) over the given pairs, skipping pairs for which absoluteCoverage.contains(pair) is false. */ + private static long calculateTotalPairwiseOverlap(Iterable datasetPairs, SymmetricPerDatasetPairCount absoluteCoverage) { + long totalPairwiseOverlap = 0L; + for 
(ResourcePair datasetPair : datasetPairs) { + if (absoluteCoverage.contains(datasetPair)) { + totalPairwiseOverlap += absoluteCoverage.get(datasetPair); + } + } + return totalPairwiseOverlap; + } + /** Adds up, over the given pairs, the product of the deduplicated counts of both pair members and divides the sum by totalPairwiseOverlap (scale SCALE, RoundingMode.HALF_UP); the only caller in this class passes a non-zero overlap. */ + private static BigDecimal calculateEstimatedPopulationSize(Iterable datasetPairs, PerDatasetCount deduplicatedCount, long totalPairwiseOverlap) { + BigDecimal estimatedPopulationSize = BigDecimal.ZERO; + for (ResourcePair datasetPair : datasetPairs) { + BigDecimal deduplicatedCount1 = BigDecimal.valueOf(deduplicatedCount.get(datasetPair.first)); + BigDecimal deduplicatedCount2 = BigDecimal.valueOf(deduplicatedCount.get(datasetPair.second)); + estimatedPopulationSize = estimatedPopulationSize.add(deduplicatedCount1.multiply(deduplicatedCount2)); + } + estimatedPopulationSize = estimatedPopulationSize.divide(BigDecimal.valueOf(totalPairwiseOverlap), SCALE, + RoundingMode.HALF_UP); + return estimatedPopulationSize; + } + /** Creates a new measure and sets, for each given dataset, its deduplicated count divided by estimatedPopulationSize (scale SCALE, rounding ROUNDING_MODE). */ + private static Completeness calculateCompleteness(Iterable datasets, PerDatasetCount deduplicatedCount, BigDecimal estimatedPopulationSize) { + Completeness completeness = new Completeness(); + for (Resource dataset : datasets) { + BigDecimal numerator = BigDecimal.valueOf(deduplicatedCount.get(dataset)); + BigDecimal completenessOfDataset = numerator.divide(estimatedPopulationSize, SCALE, ROUNDING_MODE); + completeness.set(dataset, completenessOfDataset); + } + return completeness; + } + /** For each key of values, calls Metadata.addQualityMeasurement with that dataset's value, the remaining keys as the other datasets, the aspect IRI and outputModelsMap.get(dataset). */ + public void storeInModel(Aspect aspect, Map outputModelsMap) { + for (Resource dataset : values.keySet()) { + Collection otherDatasets = new HashSet<>(values.keySet()); + otherDatasets.remove(dataset); + Metadata.addQualityMeasurement(quantity, get(dataset), unit, dataset, variable, + otherDatasets, aspect.getIri(), outputModelsMap.get(dataset)); + } + } +} diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetRatio.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetRatio.java deleted file mode 100644 index 
27cef6a..0000000 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetRatio.java +++ /dev/null @@ -1,54 +0,0 @@ -/*- - * Copyright © 2019-2022 Heinz Nixdorf Chair for Distributed Information Systems, - * Friedrich Schiller University Jena (http://www.fusion.uni-jena.de/) - * Copyright © 2023-2024 Jan Martin Keil (jan-martin.keil@uni-jena.de) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. --*/ - -package de.uni_jena.cs.fusion.abecto.measure; - -import de.uni_jena.cs.fusion.abecto.Aspect; -import de.uni_jena.cs.fusion.abecto.Metadata; -import org.apache.jena.rdf.model.Model; -import org.apache.jena.rdf.model.Resource; - -import java.util.Collection; -import java.util.HashSet; -import java.util.Map; - -public class PerDatasetRatio extends Ratio { - public PerDatasetRatio(Resource quantity, Resource unit) { - super(quantity, unit); - } - - public static void storeMeasuresByVariableInModelAsComparedToAllOtherResources(Map measuresByVariable, Aspect aspect, Map outputModelsMap) { - for (PerDatasetRatio measure : measuresByVariable.values()) { - measure.storeInModelAsComparedToAllOtherResources(aspect, outputModelsMap); - } - } - - @Override - public void storeInModel(Aspect aspect, Map outputModelsMap) { - throw new UnsupportedOperationException(); // TODO - } - - public void storeInModelAsComparedToAllOtherResources(Aspect aspect, Map outputModelsMap) { - for (Resource dataset : values.keySet()) { - Collection otherDatasets = new 
HashSet<>(values.keySet()); - otherDatasets.remove(dataset); - Metadata.addQualityMeasurement(quantity, get(dataset), unit, dataset, variable, - otherDatasets, aspect.getIri(), outputModelsMap.get(dataset)); - } - } -} diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/ComparisonProcessor.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/ComparisonProcessor.java index 2858a4a..86eef44 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/ComparisonProcessor.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/ComparisonProcessor.java @@ -20,20 +20,12 @@ import com.google.common.collect.Streams; import de.uni_jena.cs.fusion.abecto.Aspect; -import de.uni_jena.cs.fusion.abecto.ResourcePair; -import de.uni_jena.cs.fusion.abecto.measure.PerDatasetCount; -import de.uni_jena.cs.fusion.abecto.measure.SymmetricPerDatasetPairCount; -import de.uni_jena.cs.fusion.abecto.measure.PerDatasetRatio; -import de.uni_jena.cs.fusion.abecto.vocabulary.AV; -import de.uni_jena.cs.fusion.abecto.vocabulary.OM; import org.apache.jena.query.Query; import org.apache.jena.query.QueryExecutionFactory; import org.apache.jena.query.ResultSet; import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.Resource; -import java.math.BigDecimal; -import java.math.RoundingMode; import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -96,36 +88,4 @@ Map getOutputMetaModels(Iterable datasets) { } return outputMetaModelByDataset; } - - PerDatasetRatio calculateCompleteness(Iterable datasetPairs, SymmetricPerDatasetPairCount absoluteCoverage, PerDatasetCount deduplicatedCount) { - PerDatasetRatio completeness = new PerDatasetRatio(AV.marCompletenessThomas08, OM.one); - long totalPairwiseOverlap = calculateTotalPairwiseOverlap(datasetPairs, absoluteCoverage); - if (totalPairwiseOverlap != 0) { - BigDecimal estimatedPopulationSize = calculateEstimatedPopulationSize(datasetPairs, 
deduplicatedCount, totalPairwiseOverlap); - completeness.setRatioOf(deduplicatedCount, estimatedPopulationSize); - } - return completeness; - } - - long calculateTotalPairwiseOverlap(Iterable datasetPairs, SymmetricPerDatasetPairCount absoluteCoverage) { - long totalPairwiseOverlap = 0L; - for (ResourcePair datasetPair : datasetPairs) { - if (absoluteCoverage.contains(datasetPair)) { - totalPairwiseOverlap += absoluteCoverage.get(datasetPair); - } - } - return totalPairwiseOverlap; - } - - BigDecimal calculateEstimatedPopulationSize(Iterable datasetPairs, PerDatasetCount deduplicatedCount, long totalPairwiseOverlap) { - BigDecimal estimatedPopulationSize = BigDecimal.ZERO; - for (ResourcePair datasetPair : datasetPairs) { - BigDecimal deduplicatedCount1 = BigDecimal.valueOf(deduplicatedCount.get(datasetPair.first)); - BigDecimal deduplicatedCount2 = BigDecimal.valueOf(deduplicatedCount.get(datasetPair.second)); - estimatedPopulationSize = estimatedPopulationSize.add(deduplicatedCount1.multiply(deduplicatedCount2)); - } - estimatedPopulationSize = estimatedPopulationSize.divide(BigDecimal.valueOf(totalPairwiseOverlap), SCALE, - RoundingMode.HALF_UP); - return estimatedPopulationSize; - } } diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PopulationComparisonProcessor.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PopulationComparisonProcessor.java index 6022606..89a4fe2 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PopulationComparisonProcessor.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PopulationComparisonProcessor.java @@ -69,7 +69,7 @@ public class PopulationComparisonProcessor extends ComparisonProcessor> unprocessedResourcesByDataset = new HashMap<>(); @@ -89,7 +89,7 @@ void compareAspectPopulation(Aspect aspect) { measureResourceCounts(); countAndReportCoverageAndDuplicatesAndOmissions(getCorrespondenceGroups()); calculateDeduplicatedCount(); 
- completeness = calculateCompleteness(datasetPairs, absoluteCoverage, deduplicatedCount); + completeness = Completeness.calculate(absoluteCoverage, deduplicatedCount); calculateRelativeCoverages(); count.storeInModel(aspect, outputMetaModelByDataset); @@ -97,7 +97,7 @@ void compareAspectPopulation(Aspect aspect) { // TODO store duplicateCount (requires definition of measure IRI) absoluteCoverage.storeInModel(aspect, outputMetaModelByDataset); relativeCoverage.storeInModel(aspect, outputMetaModelByDataset); - completeness.storeInModelAsComparedToAllOtherResources(aspect, outputMetaModelByDataset); + completeness.storeInModel(aspect, outputMetaModelByDataset); reportOmissionsOfUnprocessedResources(); } diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java index eef2435..33162fc 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java @@ -87,7 +87,7 @@ public class PropertyComparisonProcessor extends ComparisonProcessor distinctValuesCount; - Map valueCompleteness; + Map valueCompleteness; Map> unprocessedResourcesByDataset = new HashMap<>(); Map>>> resourcesByNonDistinctValueByDatasetByVariable = new HashMap<>(); @@ -547,7 +547,7 @@ protected void calculateRelativeCoverage() { protected void calculateCompleteness() { for (String variable : variables) { - PerDatasetRatio valueCompletenessOfVariable = calculateCompleteness(datasetPairs, absoluteValueCoverage.get(variable), distinctValuesCount.get(variable)); + Completeness valueCompletenessOfVariable = Completeness.calculate(absoluteValueCoverage.get(variable), distinctValuesCount.get(variable)); valueCompletenessOfVariable.setVariable(variable); valueCompleteness.put(variable, valueCompletenessOfVariable); } 
@@ -563,6 +563,6 @@ protected void storeMeasures() { // TODO add value exclusion filter description to measurement description Measure.storeMeasuresByVariableInModel(relativeValueCoverage, theAspect, outputMetaModelByDataset); // TODO add value exclusion filter description to measurement description - PerDatasetRatio.storeMeasuresByVariableInModelAsComparedToAllOtherResources(valueCompleteness, theAspect, outputMetaModelByDataset); + Measure.storeMeasuresByVariableInModel(valueCompleteness, theAspect, outputMetaModelByDataset); } }