diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/ResourcePair.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/ResourcePair.java index cc12384..75189ad 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/ResourcePair.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/ResourcePair.java @@ -20,6 +20,7 @@ import org.apache.jena.rdf.model.Resource; +import java.util.Collection; import java.util.HashSet; import java.util.Set; @@ -61,6 +62,16 @@ public static ResourcePair getPair(Resource first, Resource second) { } } + public static Set getPairsBothContainedIn(Collection pairs, Collection collection) { + Set datasetPairs = new HashSet<>(pairs); + datasetPairs.removeIf(pair -> notBothContainedIn(pair, collection)); + return datasetPairs; + } + + private static boolean notBothContainedIn(ResourcePair pair, Collection collection) { + return !collection.contains(pair.first) || !collection.contains(pair.second); + } + private static boolean validOrder(Resource first, Resource second) { // do not use Resource#getURI() as it might be null for blank nodes return first.hashCode() < second.hashCode(); diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/SymmetricPerDatasetPairCount.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/AbsoluteCoverage.java similarity index 77% rename from abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/SymmetricPerDatasetPairCount.java rename to abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/AbsoluteCoverage.java index 8fced19..ba3c4ea 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/SymmetricPerDatasetPairCount.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/AbsoluteCoverage.java @@ -21,22 +21,24 @@ import de.uni_jena.cs.fusion.abecto.Aspect; import de.uni_jena.cs.fusion.abecto.Metadata; import de.uni_jena.cs.fusion.abecto.ResourcePair; +import de.uni_jena.cs.fusion.abecto.vocabulary.AV; +import de.uni_jena.cs.fusion.abecto.vocabulary.OM; import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.Resource; import java.util.HashMap; import java.util.Map; -public class SymmetricPerDatasetPairCount extends Count { +public class AbsoluteCoverage extends Count { - public SymmetricPerDatasetPairCount(Resource quantity, Resource unit) { - super(quantity, unit); + public AbsoluteCoverage() { + super(AV.absoluteCoverage, OM.one); } - public static Map createMapByVariable(Iterable variables, Resource quantity, Resource unit) { - Map mapByVariable = new HashMap<>(); + public static Map createMapByVariable(Iterable variables) { + Map mapByVariable = new HashMap<>(); for (String variable : variables) { - SymmetricPerDatasetPairCount countOfVariable = new SymmetricPerDatasetPairCount(quantity, unit); + AbsoluteCoverage countOfVariable = new AbsoluteCoverage(); countOfVariable.setVariable(variable); mapByVariable.put(variable, countOfVariable); } diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Completeness.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Completeness.java index f5532c0..255ab03 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Completeness.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Completeness.java @@ -36,7 +36,7 @@ public Completeness() { super(AV.marCompletenessThomas08, OM.one); } - public static Completeness calculate(SymmetricPerDatasetPairCount absoluteCoverage, PerDatasetCount deduplicatedCount) { + public static Completeness calculate(AbsoluteCoverage absoluteCoverage, PerDatasetCount deduplicatedCount) { Set datasetPairs = getDatasetPairsWithSufficientData(absoluteCoverage, deduplicatedCount); long totalPairwiseOverlap = calculateTotalPairwiseOverlap(datasetPairs, absoluteCoverage); if (totalPairwiseOverlap != 0) { @@ -47,18 +47,13 @@ public static Completeness calculate(SymmetricPerDatasetPairCount absoluteCovera return new Completeness(); // empty } - private static Set getDatasetPairsWithSufficientData(SymmetricPerDatasetPairCount absoluteCoverage, PerDatasetCount deduplicatedCount) { - Set datasetPairs = new HashSet<>(absoluteCoverage.keySet()); + private static Set getDatasetPairsWithSufficientData(AbsoluteCoverage absoluteCoverage, PerDatasetCount deduplicatedCount) { + Set datasetPairs = absoluteCoverage.keySet(); Set datasetsWithDeduplicatedCount = deduplicatedCount.keySet(); - datasetPairs.removeIf(pair -> notBothContainedIn(pair, datasetsWithDeduplicatedCount)); - return datasetPairs; + return ResourcePair.getPairsBothContainedIn(datasetPairs, datasetsWithDeduplicatedCount); } - private static boolean notBothContainedIn(ResourcePair pair, Collection collection) { - return !collection.contains(pair.first) || !collection.contains(pair.second); - } - - private static long calculateTotalPairwiseOverlap(Iterable datasetPairs, SymmetricPerDatasetPairCount absoluteCoverage) { + private static long calculateTotalPairwiseOverlap(Iterable datasetPairs, AbsoluteCoverage absoluteCoverage) { long totalPairwiseOverlap = 0L; for (ResourcePair datasetPair : datasetPairs) { if (absoluteCoverage.contains(datasetPair)) { diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetTupelRatio.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/RelativeCoverage.java similarity index 58% rename from abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetTupelRatio.java rename to abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/RelativeCoverage.java index 8799525..b9ed8ca 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetTupelRatio.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/RelativeCoverage.java @@ -22,37 +22,36 @@ import de.uni_jena.cs.fusion.abecto.Metadata; import de.uni_jena.cs.fusion.abecto.ResourcePair; import de.uni_jena.cs.fusion.abecto.ResourceTupel; +import de.uni_jena.cs.fusion.abecto.vocabulary.AV; +import de.uni_jena.cs.fusion.abecto.vocabulary.OM; import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.Resource; import java.math.BigDecimal; -import java.util.HashMap; import java.util.Map; +import java.util.Set; -public class PerDatasetTupelRatio extends Ratio { +public class RelativeCoverage extends Ratio { - public PerDatasetTupelRatio(Resource quantity, Resource unit) { - super(quantity, unit); + public RelativeCoverage() { + super(AV.relativeCoverage, OM.one); } - public static Map createMapByVariable(Iterable variables, Resource quantity, Resource unit) { - Map mapByVariable = new HashMap<>(); - for (String variable : variables) { - PerDatasetTupelRatio ratioOfVariable = new PerDatasetTupelRatio(quantity, unit); - ratioOfVariable.setVariable(variable); - mapByVariable.put(variable, ratioOfVariable); + public static RelativeCoverage calculate(AbsoluteCoverage absoluteCoverage, PerDatasetCount deduplicatedCount) { + RelativeCoverage relativeCoverage = new RelativeCoverage(); + Set datasetPairs = getDatasetPairsWithSufficientData(absoluteCoverage, deduplicatedCount); + for (ResourcePair datasetPair : datasetPairs) { + BigDecimal absoluteCoverageOfPair = BigDecimal.valueOf(absoluteCoverage.get(datasetPair)); + relativeCoverage.setRatioForTupel(absoluteCoverageOfPair, deduplicatedCount, datasetPair.first, datasetPair.second); + relativeCoverage.setRatioForTupel(absoluteCoverageOfPair, deduplicatedCount, datasetPair.second, datasetPair.first); } - return mapByVariable; + return relativeCoverage; } - public void setRatioOf(SymmetricPerDatasetPairCount numerators, PerDatasetCount denominators) { - for (ResourcePair datasetPair : numerators.keySet()) { - if (denominators.contains(datasetPair.first) && denominators.contains(datasetPair.second)) { - BigDecimal numerator = BigDecimal.valueOf(numerators.get(datasetPair)); - setRatioForTupel(numerator, denominators, datasetPair.first, datasetPair.second); - setRatioForTupel(numerator, denominators, datasetPair.second, datasetPair.first); - } - } + private static Set getDatasetPairsWithSufficientData(AbsoluteCoverage absoluteCoverage, PerDatasetCount deduplicatedCount) { + Set datasetPairsWithAbsoluteCoverage = absoluteCoverage.keySet(); + Set datasetsWithDeduplicatedCount = deduplicatedCount.keySet(); + return ResourcePair.getPairsBothContainedIn(datasetPairsWithAbsoluteCoverage, datasetsWithDeduplicatedCount); } void setRatioForTupel(BigDecimal numerator, PerDatasetCount denominators, Resource assessedDataset, Resource comparedDataset) { diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PopulationComparisonProcessor.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PopulationComparisonProcessor.java index 89a4fe2..f5ed1cd 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PopulationComparisonProcessor.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PopulationComparisonProcessor.java @@ -55,7 +55,7 @@ public class PopulationComparisonProcessor extends ComparisonProcessor> unprocessedResourcesByDataset = new HashMap<>(); @@ -89,8 +89,8 @@ void compareAspectPopulation(Aspect aspect) { measureResourceCounts(); countAndReportCoverageAndDuplicatesAndOmissions(getCorrespondenceGroups()); calculateDeduplicatedCount(); + relativeCoverage = RelativeCoverage.calculate(absoluteCoverage, deduplicatedCount); completeness = Completeness.calculate(absoluteCoverage, deduplicatedCount); - calculateRelativeCoverages(); count.storeInModel(aspect, outputMetaModelByDataset); deduplicatedCount.storeInModel(aspect, outputMetaModelByDataset); @@ -124,7 +124,6 @@ private void resetMeasures() { deduplicatedCount.reset(datasets, 0L); duplicateCount.reset(datasets, 0L); absoluteCoverage.reset(datasetPairs, 0L); - relativeCoverage.reset(datasetTupels, BigDecimal.ZERO); } private void countAndReportCoverageAndDuplicatesAndOmissions(Stream> correspondenceGroups) { @@ -227,10 +226,6 @@ private void reportOmissionsOfUnprocessedResourcesForResource(Resource dataset, } } - private void calculateRelativeCoverages() { - relativeCoverage.setRatioOf(absoluteCoverage,deduplicatedCount); - } - private void loadResourcesOfAspectAndDataset(Resource dataset) { Set distinctResources = getResourceKeys(aspect, dataset).collect(Collectors.toSet()); unprocessedResourcesByDataset.put(dataset, distinctResources); diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java index 33162fc..353e44e 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java @@ -77,8 +77,8 @@ public class PropertyComparisonProcessor extends ComparisonProcessor absoluteValueCoverage; - Map relativeValueCoverage; + Map absoluteValueCoverage; + Map relativeValueCoverage = new HashMap<>(); /** * Number of values in this dataset, per variable. */ @@ -87,7 +87,7 @@ public class PropertyComparisonProcessor extends ComparisonProcessor distinctValuesCount; - Map valueCompleteness; + Map valueCompleteness = new HashMap<>(); Map> unprocessedResourcesByDataset = new HashMap<>(); Map>>> resourcesByNonDistinctValueByDatasetByVariable = new HashMap<>(); @@ -138,10 +138,8 @@ protected void initializeMeasures() { nonDistinctValuesCount = PerDatasetCount.createMapByVariable(variables, AV.count, OM.one); setZeroForVariablesCoveredByDataset(nonDistinctValuesCount); distinctValuesCount = PerDatasetCount.createMapByVariable(variables, AV.deduplicatedCount, OM.one); - absoluteValueCoverage = SymmetricPerDatasetPairCount.createMapByVariable(variables, AV.absoluteCoverage, OM.one); + absoluteValueCoverage = AbsoluteCoverage.createMapByVariable(variables); setZeroForVariablesCoveredByDatasetPair(absoluteValueCoverage); - relativeValueCoverage = PerDatasetTupelRatio.createMapByVariable(variables, AV.relativeCoverage, OM.one); - valueCompleteness = new HashMap<>(); } protected > void setZeroForVariablesCoveredByDataset(Map measures) { @@ -328,7 +326,7 @@ protected void measureDistinctValuesCount() { protected void measureAbsoluteCoverage() { for (String variable : variables) { Map>> resourcesByDistinctValueByDataset = resourcesByDistinctValueByDatasetByVariable.get(variable); - SymmetricPerDatasetPairCount absoluteCoverageForVariable = absoluteValueCoverage.get(variable); + AbsoluteCoverage absoluteCoverageForVariable = absoluteValueCoverage.get(variable); for (ResourcePair datasetPair : datasetPairs) { if (theAspect.variableCoveredByDatasets(variable, datasetPair.first, datasetPair.second)) { Set distinctValuesOfFirstDataset = resourcesByDistinctValueByDataset.get(datasetPair.first).keySet(); @@ -538,16 +536,19 @@ protected int countDistinctValues(Set nonDistinctValues) { protected void calculateRelativeCoverage() { for (String variable : variables) { - PerDatasetTupelRatio relativeCoverageOfVariable = relativeValueCoverage.get(variable); - SymmetricPerDatasetPairCount absoluteCoverageOfVariable = absoluteValueCoverage.get(variable); - PerDatasetCount deduplicatedCountOfVariable = distinctValuesCount.get(variable); - relativeCoverageOfVariable.setRatioOf(absoluteCoverageOfVariable, deduplicatedCountOfVariable); + AbsoluteCoverage absoluteCoverageOfVariable = absoluteValueCoverage.get(variable); + PerDatasetCount distinctValuesCountOfVariable = distinctValuesCount.get(variable); + RelativeCoverage relativeCoverageOfVariable = RelativeCoverage.calculate(absoluteCoverageOfVariable, distinctValuesCountOfVariable); + relativeCoverageOfVariable.setVariable(variable); + relativeValueCoverage.put(variable, relativeCoverageOfVariable); } } protected void calculateCompleteness() { for (String variable : variables) { - Completeness valueCompletenessOfVariable = Completeness.calculate(absoluteValueCoverage.get(variable), distinctValuesCount.get(variable)); + AbsoluteCoverage absoluteCoverageOfVariable = absoluteValueCoverage.get(variable); + PerDatasetCount distinctValuesCountOfVariable = distinctValuesCount.get(variable); + Completeness valueCompletenessOfVariable = Completeness.calculate(absoluteCoverageOfVariable, distinctValuesCountOfVariable); valueCompletenessOfVariable.setVariable(variable); valueCompleteness.put(variable, valueCompletenessOfVariable); }