Skip to content

Commit

Permalink
refactor measures
Browse files Browse the repository at this point in the history
  • Loading branch information
jmkeil committed Oct 2, 2024
1 parent 9e99d80 commit 749cad4
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import org.apache.jena.rdf.model.Resource;

import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

Expand Down Expand Up @@ -61,6 +62,16 @@ public static ResourcePair getPair(Resource first, Resource second) {
}
}

/**
 * Selects those of the given pairs whose {@code first} and {@code second}
 * members are both contained in the given collection.
 *
 * @param pairs      the candidate pairs; not modified by this method
 * @param collection the resources both members of a pair must be contained in
 * @return a new, mutable {@link HashSet} holding the matching pairs
 */
public static Set<ResourcePair> getPairsBothContainedIn(Collection<ResourcePair> pairs, Collection<Resource> collection) {
    Set<ResourcePair> retainedPairs = new HashSet<>();
    for (ResourcePair candidate : pairs) {
        // keep only the pairs that are fully covered by the collection
        if (!notBothContainedIn(candidate, collection)) {
            retainedPairs.add(candidate);
        }
    }
    return retainedPairs;
}

/**
 * Checks whether at least one member of the pair is missing from the collection.
 *
 * @param pair       the pair whose {@code first} and {@code second} members are looked up
 * @param collection the collection to look the members up in
 * @return {@code true} if {@code first} or {@code second} is not contained in
 *         the collection, {@code false} if both are contained
 */
private static boolean notBothContainedIn(ResourcePair pair, Collection<Resource> collection) {
    // the second lookup is skipped if the first member is already missing
    boolean bothContained = collection.contains(pair.first) && collection.contains(pair.second);
    return !bothContained;
}

private static boolean validOrder(Resource first, Resource second) {
// do not use Resource#getURI() as it might be null for blank nodes
return first.hashCode() < second.hashCode();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,24 @@
import de.uni_jena.cs.fusion.abecto.Aspect;
import de.uni_jena.cs.fusion.abecto.Metadata;
import de.uni_jena.cs.fusion.abecto.ResourcePair;
import de.uni_jena.cs.fusion.abecto.vocabulary.AV;
import de.uni_jena.cs.fusion.abecto.vocabulary.OM;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.Resource;

import java.util.HashMap;
import java.util.Map;

public class SymmetricPerDatasetPairCount extends Count<ResourcePair> {
public class AbsoluteCoverage extends Count<ResourcePair> {

public SymmetricPerDatasetPairCount(Resource quantity, Resource unit) {
super(quantity, unit);
public AbsoluteCoverage() {
super(AV.absoluteCoverage, OM.one);
}

public static Map<String, SymmetricPerDatasetPairCount> createMapByVariable(Iterable<String> variables, Resource quantity, Resource unit) {
Map<String, SymmetricPerDatasetPairCount> mapByVariable = new HashMap<>();
public static Map<String, AbsoluteCoverage> createMapByVariable(Iterable<String> variables) {
Map<String, AbsoluteCoverage> mapByVariable = new HashMap<>();
for (String variable : variables) {
SymmetricPerDatasetPairCount countOfVariable = new SymmetricPerDatasetPairCount(quantity, unit);
AbsoluteCoverage countOfVariable = new AbsoluteCoverage();
countOfVariable.setVariable(variable);
mapByVariable.put(variable, countOfVariable);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public Completeness() {
super(AV.marCompletenessThomas08, OM.one);
}

public static Completeness calculate(SymmetricPerDatasetPairCount absoluteCoverage, PerDatasetCount deduplicatedCount) {
public static Completeness calculate(AbsoluteCoverage absoluteCoverage, PerDatasetCount deduplicatedCount) {
Set<ResourcePair> datasetPairs = getDatasetPairsWithSufficientData(absoluteCoverage, deduplicatedCount);
long totalPairwiseOverlap = calculateTotalPairwiseOverlap(datasetPairs, absoluteCoverage);
if (totalPairwiseOverlap != 0) {
Expand All @@ -47,18 +47,13 @@ public static Completeness calculate(SymmetricPerDatasetPairCount absoluteCovera
return new Completeness(); // empty
}

private static Set<ResourcePair> getDatasetPairsWithSufficientData(SymmetricPerDatasetPairCount absoluteCoverage, PerDatasetCount deduplicatedCount) {
Set<ResourcePair> datasetPairs = new HashSet<>(absoluteCoverage.keySet());
private static Set<ResourcePair> getDatasetPairsWithSufficientData(AbsoluteCoverage absoluteCoverage, PerDatasetCount deduplicatedCount) {
Set<ResourcePair> datasetPairs = absoluteCoverage.keySet();
Set<Resource> datasetsWithDeduplicatedCount = deduplicatedCount.keySet();
datasetPairs.removeIf(pair -> notBothContainedIn(pair, datasetsWithDeduplicatedCount));
return datasetPairs;
return ResourcePair.getPairsBothContainedIn(datasetPairs, datasetsWithDeduplicatedCount);
}

private static boolean notBothContainedIn(ResourcePair pair, Collection<Resource> collection) {
return !collection.contains(pair.first) || !collection.contains(pair.second);
}

private static long calculateTotalPairwiseOverlap(Iterable<ResourcePair> datasetPairs, SymmetricPerDatasetPairCount absoluteCoverage) {
private static long calculateTotalPairwiseOverlap(Iterable<ResourcePair> datasetPairs, AbsoluteCoverage absoluteCoverage) {
long totalPairwiseOverlap = 0L;
for (ResourcePair datasetPair : datasetPairs) {
if (absoluteCoverage.contains(datasetPair)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,37 +22,36 @@
import de.uni_jena.cs.fusion.abecto.Metadata;
import de.uni_jena.cs.fusion.abecto.ResourcePair;
import de.uni_jena.cs.fusion.abecto.ResourceTupel;
import de.uni_jena.cs.fusion.abecto.vocabulary.AV;
import de.uni_jena.cs.fusion.abecto.vocabulary.OM;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.Resource;

import java.math.BigDecimal;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

public class PerDatasetTupelRatio extends Ratio<ResourceTupel> {
public class RelativeCoverage extends Ratio<ResourceTupel> {

public PerDatasetTupelRatio(Resource quantity, Resource unit) {
super(quantity, unit);
public RelativeCoverage() {
super(AV.relativeCoverage, OM.one);
}

public static Map<String, PerDatasetTupelRatio> createMapByVariable(Iterable<String> variables, Resource quantity, Resource unit) {
Map<String, PerDatasetTupelRatio> mapByVariable = new HashMap<>();
for (String variable : variables) {
PerDatasetTupelRatio ratioOfVariable = new PerDatasetTupelRatio(quantity, unit);
ratioOfVariable.setVariable(variable);
mapByVariable.put(variable, ratioOfVariable);
public static RelativeCoverage calculate(AbsoluteCoverage absoluteCoverage, PerDatasetCount deduplicatedCount) {
RelativeCoverage relativeCoverage = new RelativeCoverage();
Set<ResourcePair> datasetPairs = getDatasetPairsWithSufficientData(absoluteCoverage, deduplicatedCount);
for (ResourcePair datasetPair : datasetPairs) {
BigDecimal absoluteCoverageOfPair = BigDecimal.valueOf(absoluteCoverage.get(datasetPair));
relativeCoverage.setRatioForTupel(absoluteCoverageOfPair, deduplicatedCount, datasetPair.first, datasetPair.second);
relativeCoverage.setRatioForTupel(absoluteCoverageOfPair, deduplicatedCount, datasetPair.second, datasetPair.first);
}
return mapByVariable;
return relativeCoverage;
}

public void setRatioOf(SymmetricPerDatasetPairCount numerators, PerDatasetCount denominators) {
for (ResourcePair datasetPair : numerators.keySet()) {
if (denominators.contains(datasetPair.first) && denominators.contains(datasetPair.second)) {
BigDecimal numerator = BigDecimal.valueOf(numerators.get(datasetPair));
setRatioForTupel(numerator, denominators, datasetPair.first, datasetPair.second);
setRatioForTupel(numerator, denominators, datasetPair.second, datasetPair.first);
}
}
/**
 * Determines the dataset pairs for which a ratio can be calculated: pairs that
 * are keys of the absolute coverage and whose two datasets are both keys of the
 * deduplicated count.
 *
 * @param absoluteCoverage  the measure whose key set provides the candidate dataset pairs
 * @param deduplicatedCount the measure whose key set provides the datasets with a known count
 * @return the pairs returned by {@link ResourcePair#getPairsBothContainedIn} for these two key sets
 */
private static Set<ResourcePair> getDatasetPairsWithSufficientData(AbsoluteCoverage absoluteCoverage, PerDatasetCount deduplicatedCount) {
    return ResourcePair.getPairsBothContainedIn(absoluteCoverage.keySet(), deduplicatedCount.keySet());
}

void setRatioForTupel(BigDecimal numerator, PerDatasetCount denominators, Resource assessedDataset, Resource comparedDataset) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public class PopulationComparisonProcessor extends ComparisonProcessor<Populatio
/**
* Number of covered resources of another dataset, excluding duplicates.
*/
SymmetricPerDatasetPairCount absoluteCoverage = new SymmetricPerDatasetPairCount(AV.absoluteCoverage, OM.one);
AbsoluteCoverage absoluteCoverage = new AbsoluteCoverage();
/**
* Number of resources in this dataset including duplicates.
*/
Expand All @@ -68,7 +68,7 @@ public class PopulationComparisonProcessor extends ComparisonProcessor<Populatio
* Number of resources in this dataset excluding duplicates.
*/
PerDatasetCount deduplicatedCount = new PerDatasetCount(AV.deduplicatedCount, OM.one);
PerDatasetTupelRatio relativeCoverage = new PerDatasetTupelRatio(AV.relativeCoverage, OM.one);
RelativeCoverage relativeCoverage;
Completeness completeness;

Map<Resource, Set<Resource>> unprocessedResourcesByDataset = new HashMap<>();
Expand All @@ -89,8 +89,8 @@ void compareAspectPopulation(Aspect aspect) {
measureResourceCounts();
countAndReportCoverageAndDuplicatesAndOmissions(getCorrespondenceGroups());
calculateDeduplicatedCount();
relativeCoverage = RelativeCoverage.calculate(absoluteCoverage, deduplicatedCount);
completeness = Completeness.calculate(absoluteCoverage, deduplicatedCount);
calculateRelativeCoverages();

count.storeInModel(aspect, outputMetaModelByDataset);
deduplicatedCount.storeInModel(aspect, outputMetaModelByDataset);
Expand Down Expand Up @@ -124,7 +124,6 @@ private void resetMeasures() {
deduplicatedCount.reset(datasets, 0L);
duplicateCount.reset(datasets, 0L);
absoluteCoverage.reset(datasetPairs, 0L);
relativeCoverage.reset(datasetTupels, BigDecimal.ZERO);
}

private void countAndReportCoverageAndDuplicatesAndOmissions(Stream<List<Resource>> correspondenceGroups) {
Expand Down Expand Up @@ -227,10 +226,6 @@ private void reportOmissionsOfUnprocessedResourcesForResource(Resource dataset,
}
}

private void calculateRelativeCoverages() {
relativeCoverage.setRatioOf(absoluteCoverage,deduplicatedCount);
}

private void loadResourcesOfAspectAndDataset(Resource dataset) {
Set<Resource> distinctResources = getResourceKeys(aspect, dataset).collect(Collectors.toSet());
unprocessedResourcesByDataset.put(dataset, distinctResources);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ public class PropertyComparisonProcessor extends ComparisonProcessor<PropertyCom
/**
* Number of covered values of another dataset, per variable.
*/
Map<String, SymmetricPerDatasetPairCount> absoluteValueCoverage;
Map<String, PerDatasetTupelRatio> relativeValueCoverage;
Map<String, AbsoluteCoverage> absoluteValueCoverage;
Map<String, RelativeCoverage> relativeValueCoverage = new HashMap<>();
/**
* Number of values in this dataset, per variable.
*/
Expand All @@ -87,7 +87,7 @@ public class PropertyComparisonProcessor extends ComparisonProcessor<PropertyCom
* Number of distinct values in this dataset, per variable. Index: variable, affectedDataset
*/
Map<String, PerDatasetCount> distinctValuesCount;
Map<String, Completeness> valueCompleteness;
Map<String, Completeness> valueCompleteness = new HashMap<>();

Map<Resource, Set<Resource>> unprocessedResourcesByDataset = new HashMap<>();
Map<String, Map<Resource, Map<RDFNode, Set<Resource>>>> resourcesByNonDistinctValueByDatasetByVariable = new HashMap<>();
Expand Down Expand Up @@ -138,10 +138,8 @@ protected void initializeMeasures() {
nonDistinctValuesCount = PerDatasetCount.createMapByVariable(variables, AV.count, OM.one);
setZeroForVariablesCoveredByDataset(nonDistinctValuesCount);
distinctValuesCount = PerDatasetCount.createMapByVariable(variables, AV.deduplicatedCount, OM.one);
absoluteValueCoverage = SymmetricPerDatasetPairCount.createMapByVariable(variables, AV.absoluteCoverage, OM.one);
absoluteValueCoverage = AbsoluteCoverage.createMapByVariable(variables);
setZeroForVariablesCoveredByDatasetPair(absoluteValueCoverage);
relativeValueCoverage = PerDatasetTupelRatio.createMapByVariable(variables, AV.relativeCoverage, OM.one);
valueCompleteness = new HashMap<>();
}

protected <M extends Count<Resource>> void setZeroForVariablesCoveredByDataset(Map<String, M> measures) {
Expand Down Expand Up @@ -328,7 +326,7 @@ protected void measureDistinctValuesCount() {
protected void measureAbsoluteCoverage() {
for (String variable : variables) {
Map<Resource, Map<RDFNode, Set<Resource>>> resourcesByDistinctValueByDataset = resourcesByDistinctValueByDatasetByVariable.get(variable);
SymmetricPerDatasetPairCount absoluteCoverageForVariable = absoluteValueCoverage.get(variable);
AbsoluteCoverage absoluteCoverageForVariable = absoluteValueCoverage.get(variable);
for (ResourcePair datasetPair : datasetPairs) {
if (theAspect.variableCoveredByDatasets(variable, datasetPair.first, datasetPair.second)) {
Set<RDFNode> distinctValuesOfFirstDataset = resourcesByDistinctValueByDataset.get(datasetPair.first).keySet();
Expand Down Expand Up @@ -538,16 +536,19 @@ protected int countDistinctValues(Set<RDFNode> nonDistinctValues) {

protected void calculateRelativeCoverage() {
for (String variable : variables) {
PerDatasetTupelRatio relativeCoverageOfVariable = relativeValueCoverage.get(variable);
SymmetricPerDatasetPairCount absoluteCoverageOfVariable = absoluteValueCoverage.get(variable);
PerDatasetCount deduplicatedCountOfVariable = distinctValuesCount.get(variable);
relativeCoverageOfVariable.setRatioOf(absoluteCoverageOfVariable, deduplicatedCountOfVariable);
AbsoluteCoverage absoluteCoverageOfVariable = absoluteValueCoverage.get(variable);
PerDatasetCount distinctValuesCountOfVariable = distinctValuesCount.get(variable);
RelativeCoverage relativeCoverageOfVariable = RelativeCoverage.calculate(absoluteCoverageOfVariable, distinctValuesCountOfVariable);
relativeCoverageOfVariable.setVariable(variable);
relativeValueCoverage.put(variable, relativeCoverageOfVariable);
}
}

protected void calculateCompleteness() {
for (String variable : variables) {
Completeness valueCompletenessOfVariable = Completeness.calculate(absoluteValueCoverage.get(variable), distinctValuesCount.get(variable));
AbsoluteCoverage absoluteCoverageOfVariable = absoluteValueCoverage.get(variable);
PerDatasetCount distinctValuesCountOfVariable = distinctValuesCount.get(variable);
Completeness valueCompletenessOfVariable = Completeness.calculate(absoluteCoverageOfVariable, distinctValuesCountOfVariable);
valueCompletenessOfVariable.setVariable(variable);
valueCompleteness.put(variable, valueCompletenessOfVariable);
}
Expand Down

0 comments on commit 749cad4

Please sign in to comment.