diff --git a/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/ComparisonBenchmarkDataSupplier.java b/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/ComparisonBenchmarkDataSupplier.java index d61c020..14ae6b7 100644 --- a/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/ComparisonBenchmarkDataSupplier.java +++ b/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/ComparisonBenchmarkDataSupplier.java @@ -58,20 +58,20 @@ public Stream getResourceKeys(Resource sample) throws NullPointerExcep return IntStream.range(0, sampleSize).map(localId -> localId + sampleId * sampleSize).mapToObj(i -> ResourceFactory.createResource(Integer.toString(i))); } - public Map> selectResourceValues(Resource resource, Resource sample, - Collection variables) { + public Map> getValuesByVariable(Resource resource, Resource sample, + Collection variables) { int resourceId = Integer.parseInt(resource.getURI()); int sampleId = Integer.parseInt(sample.getURI()); if (resourceId / sampleSize == sampleId) { - return selectResourceValues(resourceId, sampleId, variables.iterator().next()); + return getValuesByVariable(resourceId, sampleId, variables.iterator().next()); } else { return null; } } - public Map> selectResourceValues(int resourceId, int sampleId, - String variable) { - Set valuesSet = new HashSet<>(1); // Note: must be mutable + public Map> getValuesByVariable(int resourceId, int sampleId, + String variable) { + Set valuesSet = new HashSet<>(1); // Note: must be mutable TODO not necessary after not ignoring wrong values for measures anymore if (resourceId % sampleSize >= sampleSize * errorRate) { valuesSet.add(correctValue); } else { @@ -80,15 +80,15 @@ public Map> selectResourceValues(int resourceId, int sample return Collections.singletonMap(variable, valuesSet); } - public Map>> selectResourceValues(Collection resources, - Resource sample, List variables) { + public Map>> getValuesByVariableResource(Collection resources, + Resource sample, List variables) { int sampleId = Integer.parseInt(sample.getURI()); String variable = variables.iterator().next(); Map>> resourceValues = new HashMap<>(); for (Resource resource : resources) { int resourceId = Integer.parseInt(resource.getURI()); if (resourceId / sampleSize == sampleId) { - resourceValues.put(resource, selectResourceValues(resourceId, sampleId, variable)); + resourceValues.put(resource, getValuesByVariable(resourceId, sampleId, variable)); } } return resourceValues; diff --git a/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/PopulationComparisonProcessorBenchmark.java b/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/PopulationComparisonProcessorBenchmark.java index f66ab39..193f84b 100644 --- a/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/PopulationComparisonProcessorBenchmark.java +++ b/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/PopulationComparisonProcessorBenchmark.java @@ -89,6 +89,7 @@ public Stream getResourceKeys(Aspect aspect, Resource dataset) throws return this.dataSupplier.getResourceKeys(dataset); } + @Override public Stream> getCorrespondenceGroups() { return this.dataSupplier.getCorrespondenceGroups(); } diff --git a/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/PropertyComparisonProcessorBenchmark.java b/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/PropertyComparisonProcessorBenchmark.java index 081a3b5..8a377f8 100644 --- a/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/PropertyComparisonProcessorBenchmark.java +++ b/abecto-benchmark/src/main/java/de/uni_jena/cs/fusion/abecto/benchmark/PropertyComparisonProcessorBenchmark.java @@ -88,20 +88,11 @@ public Stream getResourceKeys(@SuppressWarnings("unused") Aspect aspec return this.dataSupplier.getResourceKeys(dataset); } - public Map> selectResourceValues(Resource resource, Resource dataset, - @SuppressWarnings("unused") Aspect aspect, - Collection variables) { - return this.dataSupplier.selectResourceValues(resource, dataset, variables); - } - - public Map>> selectResourceValues(Collection resources, - Resource dataset, - @SuppressWarnings("unused") Aspect aspect, - List variables) { - return this.dataSupplier.selectResourceValues(resources, dataset, variables); + @Override + protected Map> getValuesByVariable(Resource dataset, Resource resource) { + return this.dataSupplier.getValuesByVariable(resource, dataset, variables); } - @Override public Stream> getCorrespondenceGroups() { return this.dataSupplier.getCorrespondenceGroups(); diff --git a/abecto-benchmark/src/test/java/de/uni_jena/cs/fusion/abecto/benchmark/ComparisonBenchmarkDataSupplierTest.java b/abecto-benchmark/src/test/java/de/uni_jena/cs/fusion/abecto/benchmark/ComparisonBenchmarkDataSupplierTest.java index 834d396..c656e41 100644 --- a/abecto-benchmark/src/test/java/de/uni_jena/cs/fusion/abecto/benchmark/ComparisonBenchmarkDataSupplierTest.java +++ b/abecto-benchmark/src/test/java/de/uni_jena/cs/fusion/abecto/benchmark/ComparisonBenchmarkDataSupplierTest.java @@ -55,9 +55,9 @@ public void supplierTest(int datasetCount) { int rightValuesSingle = 0, wrongValuesSingle = 0, rightValuesCollection = 0, wrongValuesCollection = 0; for (Resource resource : population[i]) { // single resource method - Assertions.assertEquals(1, supplier.selectResourceValues(resource, dataset.get(i), + Assertions.assertEquals(1, supplier.getValuesByVariable(resource, dataset.get(i), Collections.singletonList("var")).get("var").size()); - if (supplier.selectResourceValues(resource, dataset.get(i), Collections.singletonList("var")).get( + if (supplier.getValuesByVariable(resource, dataset.get(i), Collections.singletonList("var")).get( "var").contains(ComparisonBenchmarkDataSupplier.correctValue)) { rightValuesSingle++; } else { @@ -65,11 +65,11 @@ public void supplierTest(int datasetCount) { } // resource collection method - Assertions.assertEquals(1, supplier.selectResourceValues(Collections.singletonList(resource), + Assertions.assertEquals(1, supplier.getValuesByVariableResource(Collections.singletonList(resource), dataset.get(i), Collections.singletonList("var")).get(resource).size()); - Assertions.assertEquals(1, supplier.selectResourceValues(Collections.singletonList(resource), + Assertions.assertEquals(1, supplier.getValuesByVariableResource(Collections.singletonList(resource), dataset.get(i), Collections.singletonList("var")).get(resource).get("var").size()); - if (supplier.selectResourceValues(Collections.singletonList(resource), dataset.get(i), + if (supplier.getValuesByVariableResource(Collections.singletonList(resource), dataset.get(i), Collections.singletonList("var")).get(resource).get("var").contains(ComparisonBenchmarkDataSupplier.correctValue)) { rightValuesCollection++; } else { diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/Aspect.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/Aspect.java index 2c0283d..03c9e6a 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/Aspect.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/Aspect.java @@ -57,337 +57,339 @@ public class Aspect { - final static Logger log = LoggerFactory.getLogger(Aspect.class); - - public static String path2String(Path value) { - return PathWriter.asString(value, Vocabularies.getDefaultPrologue()); - } - - /** - * Returns an {@link Aspect} determined by a given IRI in the given - * configuration {@link Model}. - * - * @param configurationModel the configuration {@link Model} containing the - * aspect definitions - * @param aspectIri the IRI of the {@link Aspect} to return - * @return the {@link Aspect} - * @throws NoSuchElementException if there is no {@link Aspect} with the given - * IRI - * @throws ToManyElementsException if there are multiple pattern defined for the - * same {@link Aspect} and dataset - */ - public static Aspect getAspect(Model configurationModel, Resource aspectIri) - throws NoSuchElementException, ToManyElementsException { - String keyVariableName = Models - .assertOne(configurationModel.listObjectsOfProperty(aspectIri, AV.keyVariableName)).asLiteral() - .getString(); - - Aspect aspect = new Aspect(aspectIri, keyVariableName); - - // add patterns - for (Resource aspectPatter : configurationModel.listResourcesWithProperty(AV.ofAspect, aspectIri).toList()) { - for (Resource dataset : configurationModel.listObjectsOfProperty(aspectPatter, AV.associatedDataset) - .mapWith(RDFNode::asResource).toList()) { - Query pattern = convertStringToQuery(assertOne(configurationModel - .listObjectsOfProperty(aspectPatter, AV.definingQuery)) - .asLiteral().getString()); - if (!pattern.isSelectType()) { - throw new IllegalArgumentException( - String.format("Pattern of aspect %s and dataset %s is not a SPARQL Select Query.", - aspectIri.getURI(), dataset.getURI())); - } - aspect.setPattern(dataset, pattern); - } - } - - return aspect; - } - - private static Query convertStringToQuery(String s) { - try { - return QueryFactory.create(s, Syntax.syntaxSPARQL); - } catch (QueryException e) { - throw new DatatypeFormatException("Not a valid SPARQL query.", e); - } - } - - /** - * Returns all {@link Aspect Aspects} in the given configuration {@link Model}. - * - * @param configurationModel the configuration {@link Model} containing the - * aspect definitions - * @return the {@link Aspect Aspects} - */ - public static Collection getAspects(Model configurationModel) { - // init aspect list - Collection aspects = new ArrayList<>(); - // get aspects - configurationModel.listResourcesWithProperty(RDF.type, AV.Aspect) - .mapWith(aspect -> getAspect(configurationModel, aspect)).forEach(aspects::add); - return aspects; - } - - /** - * @throws NullPointerException if no pattern is defined for the given dataset - */ - public static Optional>> getResource(Aspect aspect, Resource dataset, Resource keyValue, - Model datasetModels) throws NullPointerException { - Query query = SelectBuilder.rewrite(aspect.getPattern(dataset).cloneQuery(), - Collections.singletonMap(aspect.getKeyVariable(), keyValue.asNode())); - ResultSet results = QueryExecutionFactory.create(query, datasetModels).execSelect(); - if (results.hasNext()) { - Map> values = new HashMap<>(); - for (String varName : results.getResultVars()) { - if (!varName.equals(aspect.getKeyVariableName())) { - values.put(varName, new HashSet<>()); - } - } - while (results.hasNext()) { - QuerySolution result = results.next(); - for (Entry> entry : values.entrySet()) { - RDFNode value = result.get(entry.getKey()); - if (value != null) { - entry.getValue().add(value); - } - } - } - return Optional.of(values); - } else { - return Optional.empty(); - } - } - - /** - * Returns an index of all resources of a given {@link Aspect} and a given - * dataset by its variables and by the variable values. {@code null} values will - * be ignored. - * - * @param aspect the aspect describing the resources to index - * @param dataset the dataset to index the resources for - * @param variables the variables to use for indexing - * @param datasetModels the (union of) {@link Model Model(s)} containing the - * {@link Resource Resources} to index - * @return - * - * @throws NullPointerException if no pattern is defined for the given dataset - */ - public static Map>> getResourceIndex(Aspect aspect, Resource dataset, - Collection variables, Model datasetModels) throws NullPointerException { - return getResourceIndex(aspect, dataset, variables, datasetModels, Functions.identity()); - } - - /** - * Returns an index of all resources of a given {@link Aspect} and a given - * dataset by its variables and by the variable values. {@code null} values will - * be ignored. The variable values will be modified by the provided - * {@link Function} {@code modifier}. - *

- * For example, the {@code modifier} could be used to convert all characters of - * String variable values to lowercase characters. - * - * @param Type of the variable values after application of the - * {@code modifier} - * @param aspect the aspect describing the resources to index - * @param dataset the dataset to index the resources for - * @param variables the variables to use for indexing - * @param datasetModels the (union of) {@link Model Model(s)} containing the - * {@link Resource Resources} to index - * @param modifier the {@link Function} to modify the variable values - * before building up the index - * @return - * - * @throws NullPointerException if no pattern is defined for the given dataset - */ - public static Map>> getResourceIndex(Aspect aspect, Resource dataset, - Collection variables, Model datasetModels, Function modifier) - throws NullPointerException { - Map>> index = new HashMap<>(); - - for (String variable : variables) { - index.put(variable, new HashMap<>()); - } - - Query query = aspect.getPattern(dataset); - - // remove not needed variables from query - query = retainVariables(query, aspect.keyVariable, variables); - - ResultSet results = QueryExecutionFactory.create(query, datasetModels).execSelect(); - while (results.hasNext()) { - QuerySolution result = results.next(); - Resource keyValue = result.getResource(aspect.getKeyVariableName()); - for (String variable : variables) { - if (result.contains(variable)) { - index.get(variable).computeIfAbsent(modifier.apply(result.get(variable)), k -> new HashSet<>()) - .add(keyValue); - } - } - } - return index; - } - - /** - * Removes all result variables from a {@link Query} except of variables given - * in {@code keyVariable} and {@code variables}. - */ - static Query retainVariables(Query query, Var keyVariable, Collection variables) { - // TODO HOTFIX for https://issues.apache.org/jira/browse/JENA-2335 - return query; - // Op op = new AlgebraGenerator().compile(query); - // op = new OpProject(op, - // query.getResultVars().stream().map(Var::alloc).filter(v -> - // v.equals(keyVariable) || - // variables.contains(v.getName())).collect(Collectors.toList())); - // return OpAsQuery.asQuery(op); - } - - /** - * Returns a hash index on multiple variables for {@link Resource Resources} of - * a given {@link Aspect}. Resources with unbound variables are omitted. - */ - public static Map> getResourceHashIndex(Aspect aspect, Resource dataset, - List variables, Model datasetModels) { - Map> index = new HashMap<>(); - - Query query = aspect.getPattern(dataset); - List resultVars = query.getResultVars(); - if (!resultVars.containsAll(variables)) { // skip if unknown variable - log.warn("Failed to create resources hash index of aspect {} and dataset {}: Unknown variable(s): {}", - aspect.getIri(), dataset, variables.stream().filter(resultVars::contains).toArray()); - return index; - } - - // remove not needed variables from query - query = retainVariables(query, aspect.keyVariable, variables); - - ResultSet results = QueryExecutionFactory.create(query, datasetModels).execSelect(); - while (results.hasNext()) { - QuerySolution result = results.next(); - Resource keyValue = result.getResource(aspect.getKeyVariableName()); - if (variables.stream().allMatch(result::contains)) { // skip resources with unbound variables - Values valueArray = new Values( - variables.stream().map(result::get).toArray(RDFNode[]::new)); - index.computeIfAbsent(valueArray, k -> new HashSet<>()).add(keyValue); - } - } - return index; - } - - - - - /** - * Returns a new {@link Collection} instance containing the given resources that - * are covered by the pattern for the given dataset in the given {@link Model}. - */ - public Collection getResourcesInDataset(Collection resources, Resource dataset, Model model) { - if (!this.patternByDataset.containsKey(dataset)) { - return Collections.emptySet(); - } - Collection intersection = new ArrayList<>(); - - Query pattern = this.getPattern(dataset); - for (Resource resource : resources) { - Query query = SelectBuilder.rewrite(pattern.cloneQuery(), - Collections.singletonMap(this.keyVariable, resource.asNode())); - query.setQueryAskType(); - if (QueryExecutionFactory.create(query, model).execAsk()) { - intersection.add(resource); - } - } - return intersection; - } - - private final Resource iri; - - private final String keyVariableName; - - private final Var keyVariable; - - private final Map patternByDataset = new HashMap<>(); - - public Aspect(Resource iri, String keyVariableName) { - this.iri = iri; - this.keyVariableName = keyVariableName; - this.keyVariable = Var.alloc(keyVariableName); - } - - public Resource getIri() { - return this.iri; - } - - public Var getKeyVariable() { - return keyVariable; - } - - public String getKeyVariableName() { - return keyVariableName; - } - - /** - * Returns the pattern for the given dataset. - * - * @param dataset - * @return the pattern for the given dataset - * - * @throws NullPointerException if no pattern is defined for the given dataset - */ - public Query getPattern(Resource dataset) throws NullPointerException { - return Objects.requireNonNull(patternByDataset.get(dataset), - () -> String.format("Pattern of aspect %s for dataset %s not defined.", this.keyVariableName, dataset)); - } - - public Set getDatasets() { - return new HashSet<>(patternByDataset.keySet()); - } - - public boolean coversDataset(Resource dataset) { - return patternByDataset.containsKey(dataset); - } - - public Aspect setPattern(Resource dataset, Query pattern) { - patternByDataset.put(dataset, pattern); - return this; - } - - private Map> variablePathsByDataset = new HashMap<>(); - - public Path getVarPath(Resource dataset, String variable) { - return variablePathsByDataset.get(dataset).get(variable); - } - - public String getVarPathAsString(Resource dataset, String variable) { - return path2String(this.getVarPath(dataset, variable)); - } - - /** - * Determines the property paths from the key variable of this {@link Aspect} to - * other variables for all given dataset and adds them to the given - * {@link Model}. - * - * @param model the model to add the determined paths - */ - public void determineVarPaths(Model model) { - for (Resource dataset : patternByDataset.keySet()) { - try { - VarPathsExtractionVisitor visitor = new VarPathsExtractionVisitor(); - Query query = this.getPattern(dataset); - query.getQueryPattern().visit(visitor); - // get (blank-)node of the relevant aspect pattern - Resource aspectPattern = model.listResourcesWithProperty(AV.associatedDataset, dataset) - .filterKeep(r -> r.hasProperty(AV.ofAspect, this.iri)).next(); - this.variablePathsByDataset.put(dataset, visitor.getPaths(keyVariable)); - for (Entry variablePath : this.variablePathsByDataset.get(dataset).entrySet()) { - aspectPattern.addProperty(AV.hasVariablePath, model.createResource(AV.VariablePath)// - .addLiteral(AV.variableName, variablePath.getKey())// - .addProperty(AV.propertyPath, path2String(variablePath.getValue()))); - } - } catch (IllegalArgumentException e) { - log.warn(String.format( - "Failed to determine variables paths for aspect %s (key variable \"%s\") and dataset \"%s\".", - this.iri, this.keyVariableName, dataset), e); - } - } - } + final static Logger log = LoggerFactory.getLogger(Aspect.class); + private final Resource iri; + private final String keyVariableName; + private final Var keyVariable; + private final Map patternByDataset = new HashMap<>(); + Map> coveredVariablesByDataset = new HashMap(); + private Map> variablePathsByDataset = new HashMap<>(); + + public Aspect(Resource iri, String keyVariableName) { + this.iri = iri; + this.keyVariableName = keyVariableName; + this.keyVariable = Var.alloc(keyVariableName); + } + + public static String path2String(Path value) { + return PathWriter.asString(value, Vocabularies.getDefaultPrologue()); + } + + /** + * Returns an {@link Aspect} determined by a given IRI in the given + * configuration {@link Model}. + * + * @param configurationModel the configuration {@link Model} containing the + * aspect definitions + * @param aspectIri the IRI of the {@link Aspect} to return + * @return the {@link Aspect} + * @throws NoSuchElementException if there is no {@link Aspect} with the given + * IRI + * @throws ToManyElementsException if there are multiple pattern defined for the + * same {@link Aspect} and dataset + */ + public static Aspect getAspect(Model configurationModel, Resource aspectIri) + throws NoSuchElementException, ToManyElementsException { + String keyVariableName = Models + .assertOne(configurationModel.listObjectsOfProperty(aspectIri, AV.keyVariableName)).asLiteral() + .getString(); + + Aspect aspect = new Aspect(aspectIri, keyVariableName); + + // add patterns + for (Resource aspectPatter : configurationModel.listResourcesWithProperty(AV.ofAspect, aspectIri).toList()) { + for (Resource dataset : configurationModel.listObjectsOfProperty(aspectPatter, AV.associatedDataset) + .mapWith(RDFNode::asResource).toList()) { + Query pattern = convertStringToQuery(assertOne(configurationModel + .listObjectsOfProperty(aspectPatter, AV.definingQuery)) + .asLiteral().getString()); + if (!pattern.isSelectType()) { + throw new IllegalArgumentException( + String.format("Pattern of aspect %s and dataset %s is not a SPARQL Select Query.", + aspectIri.getURI(), dataset.getURI())); + } + aspect.setPattern(dataset, pattern); + } + } + + return aspect; + } + + private static Query convertStringToQuery(String s) { + try { + return QueryFactory.create(s, Syntax.syntaxSPARQL); + } catch (QueryException e) { + throw new DatatypeFormatException("Not a valid SPARQL query.", e); + } + } + + /** + * Returns all {@link Aspect Aspects} in the given configuration {@link Model}. + * + * @param configurationModel the configuration {@link Model} containing the + * aspect definitions + * @return the {@link Aspect Aspects} + */ + public static Collection getAspects(Model configurationModel) { + // init aspect list + Collection aspects = new ArrayList<>(); + // get aspects + configurationModel.listResourcesWithProperty(RDF.type, AV.Aspect) + .mapWith(aspect -> getAspect(configurationModel, aspect)).forEach(aspects::add); + return aspects; + } + + /** + * @throws NullPointerException if no pattern is defined for the given dataset + */ + public static Optional>> getResource(Aspect aspect, Resource dataset, Resource keyValue, + Model datasetModels) throws NullPointerException { + Query query = SelectBuilder.rewrite(aspect.getPattern(dataset).cloneQuery(), + Collections.singletonMap(aspect.getKeyVariable(), keyValue.asNode())); + ResultSet results = QueryExecutionFactory.create(query, datasetModels).execSelect(); + if (results.hasNext()) { + Map> values = new HashMap<>(); + for (String varName : results.getResultVars()) { + if (!varName.equals(aspect.getKeyVariableName())) { + values.put(varName, new HashSet<>()); + } + } + while (results.hasNext()) { + QuerySolution result = results.next(); + for (Entry> entry : values.entrySet()) { + RDFNode value = result.get(entry.getKey()); + if (value != null) { + entry.getValue().add(value); + } + } + } + return Optional.of(values); + } else { + return Optional.empty(); + } + } + + /** + * Returns an index of all resources of a given {@link Aspect} and a given + * dataset by its variables and by the variable values. {@code null} values will + * be ignored. + * + * @param aspect the aspect describing the resources to index + * @param dataset the dataset to index the resources for + * @param variables the variables to use for indexing + * @param datasetModels the (union of) {@link Model Model(s)} containing the + * {@link Resource Resources} to index + * @throws NullPointerException if no pattern is defined for the given dataset + */ + public static Map>> getResourceIndex(Aspect aspect, Resource dataset, + Collection variables, Model datasetModels) throws NullPointerException { + return getResourceIndex(aspect, dataset, variables, datasetModels, Functions.identity()); + } + + /** + * Returns an index of all resources of a given {@link Aspect} and a given + * dataset by its variables and by the variable values. {@code null} values will + * be ignored. The variable values will be modified by the provided + * {@link Function} {@code modifier}. + *

+ * For example, the {@code modifier} could be used to convert all characters of + * String variable values to lowercase characters. + * + * @param Type of the variable values after application of the + * {@code modifier} + * @param aspect the aspect describing the resources to index + * @param dataset the dataset to index the resources for + * @param variables the variables to use for indexing + * @param datasetModels the (union of) {@link Model Model(s)} containing the + * {@link Resource Resources} to index + * @param modifier the {@link Function} to modify the variable values + * before building up the index + * @throws NullPointerException if no pattern is defined for the given dataset + */ + public static Map>> getResourceIndex(Aspect aspect, Resource dataset, + Collection variables, Model datasetModels, Function modifier) + throws NullPointerException { + Map>> index = new HashMap<>(); + + for (String variable : variables) { + index.put(variable, new HashMap<>()); + } + + Query query = aspect.getPattern(dataset); + + // remove not needed variables from query + query = retainVariables(query, aspect.keyVariable, variables); + + ResultSet results = QueryExecutionFactory.create(query, datasetModels).execSelect(); + while (results.hasNext()) { + QuerySolution result = results.next(); + Resource keyValue = result.getResource(aspect.getKeyVariableName()); + for (String variable : variables) { + if (result.contains(variable)) { + index.get(variable).computeIfAbsent(modifier.apply(result.get(variable)), k -> new HashSet<>()) + .add(keyValue); + } + } + } + return index; + } + + /** + * Removes all result variables from a {@link Query} except of variables given + * in {@code keyVariable} and {@code variables}. + */ + static Query retainVariables(Query query, Var keyVariable, Collection variables) { + // TODO HOTFIX for https://issues.apache.org/jira/browse/JENA-2335 + return query; + // Op op = new AlgebraGenerator().compile(query); + // op = new OpProject(op, + // query.getResultVars().stream().map(Var::alloc).filter(v -> + // v.equals(keyVariable) || + // variables.contains(v.getName())).collect(Collectors.toList())); + // return OpAsQuery.asQuery(op); + } + + /** + * Returns a hash index on multiple variables for {@link Resource Resources} of + * a given {@link Aspect}. Resources with unbound variables are omitted. + */ + public static Map> getResourceHashIndex(Aspect aspect, Resource dataset, + List variables, Model datasetModels) { + Map> index = new HashMap<>(); + + Query query = aspect.getPattern(dataset); + List resultVars = query.getResultVars(); + if (!resultVars.containsAll(variables)) { // skip if unknown variable + log.warn("Failed to create resources hash index of aspect {} and dataset {}: Unknown variable(s): {}", + aspect.getIri(), dataset, variables.stream().filter(resultVars::contains).toArray()); + return index; + } + + // remove not needed variables from query + query = retainVariables(query, aspect.keyVariable, variables); + + ResultSet results = QueryExecutionFactory.create(query, datasetModels).execSelect(); + while (results.hasNext()) { + QuerySolution result = results.next(); + Resource keyValue = result.getResource(aspect.getKeyVariableName()); + if (variables.stream().allMatch(result::contains)) { // skip resources with unbound variables + Values valueArray = new Values( + variables.stream().map(result::get).toArray(RDFNode[]::new)); + index.computeIfAbsent(valueArray, k -> new HashSet<>()).add(keyValue); + } + } + return index; + } + + /** + * Returns a new {@link Collection} instance containing the given resources that + * are covered by the pattern for the given dataset in the given {@link Model}. + */ + public Collection getResourcesInDataset(Collection resources, Resource dataset, Model model) { + if (!patternByDataset.containsKey(dataset)) { + return Collections.emptySet(); + } + Collection intersection = new ArrayList<>(); + + Query pattern = this.getPattern(dataset); + for (Resource resource : resources) { + Query query = SelectBuilder.rewrite(pattern.cloneQuery(), + Collections.singletonMap(this.keyVariable, resource.asNode())); + query.setQueryAskType(); + if (QueryExecutionFactory.create(query, model).execAsk()) { + intersection.add(resource); + } + } + return intersection; + } + + public Resource getIri() { + return this.iri; + } + + public Var getKeyVariable() { + return keyVariable; + } + + public String getKeyVariableName() { + return keyVariableName; + } + + /** + * Returns the pattern for the given dataset. + * + * @param dataset + * @return the pattern for the given dataset + * @throws NullPointerException if no pattern is defined for the given dataset + */ + public Query getPattern(Resource dataset) throws NullPointerException { + return Objects.requireNonNull(patternByDataset.get(dataset), + () -> String.format("Pattern of aspect %s for dataset %s not defined.", this.keyVariableName, dataset)); + } + + public Set getDatasets() { + return new HashSet<>(patternByDataset.keySet()); + } + + public boolean coversDataset(Resource dataset) { + return patternByDataset.containsKey(dataset); + } + + public boolean variableCoveredByDatasets(String variable, Resource firstDataset, Resource secondDataset) { + return variableCoveredByDataset(variable, firstDataset) && variableCoveredByDataset(variable, secondDataset); + } + + public boolean variableCoveredByDataset(String variable, Resource dataset) { + Collection coveredVariables = coveredVariablesByDataset.get(dataset); + return coveredVariables.contains(variable); + } + + public Aspect setPattern(Resource dataset, Query pattern) { + patternByDataset.put(dataset, pattern); + updateCoveredVariablesByDatasetForDataset(dataset); + return this; + } + + private void updateCoveredVariablesByDatasetForDataset(Resource dataset) { + coveredVariablesByDataset.put(dataset, getPattern(dataset).getResultVars()); + } + + public Path getVarPath(Resource dataset, String variable) { + return variablePathsByDataset.get(dataset).get(variable); + } + + public String getVarPathAsString(Resource dataset, String variable) { + return path2String(this.getVarPath(dataset, variable)); + } + + /** + * Determines the property paths from the key variable of this {@link Aspect} to + * other variables for all given dataset and adds them to the given + * {@link Model}. + * + * @param model the model to add the determined paths + */ + public void determineVarPaths(Model model) { + for (Resource dataset : patternByDataset.keySet()) { + try { + VarPathsExtractionVisitor visitor = new VarPathsExtractionVisitor(); + Query query = this.getPattern(dataset); + query.getQueryPattern().visit(visitor); + // get (blank-)node of the relevant aspect pattern + Resource aspectPattern = model.listResourcesWithProperty(AV.associatedDataset, dataset) + .filterKeep(r -> r.hasProperty(AV.ofAspect, this.iri)).next(); + this.variablePathsByDataset.put(dataset, visitor.getPaths(keyVariable)); + for (Entry variablePath : this.variablePathsByDataset.get(dataset).entrySet()) { + aspectPattern.addProperty(AV.hasVariablePath, model.createResource(AV.VariablePath)// + .addLiteral(AV.variableName, variablePath.getKey())// + .addProperty(AV.propertyPath, path2String(variablePath.getValue()))); + } + } catch (IllegalArgumentException e) { + log.warn(String.format( + "Failed to determine variables paths for aspect %s (key variable \"%s\") and dataset \"%s\".", + this.iri, this.keyVariableName, dataset), e); + } + } + } } diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Count.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Count.java index e8a0b5a..870dbbf 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Count.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Count.java @@ -26,8 +26,12 @@ public Count(Resource quantity, Resource unit) { super(quantity, unit); } + public void setZero(K key) { + values.put(key, 0L); + } + public void incrementByOrSetOne(K key) { - incrementByOrSet(key, 1); + incrementByOrSet(key, 1L); } public void incrementByOrSet(K key, long increment) { diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Measure.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Measure.java index 55b050e..840794f 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Measure.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/Measure.java @@ -18,13 +18,15 @@ package de.uni_jena.cs.fusion.abecto.measure; +import de.uni_jena.cs.fusion.abecto.Aspect; +import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.Resource; import java.util.HashMap; import java.util.Map; import java.util.Set; -abstract class Measure { +public abstract class Measure { final Map values = new HashMap<>(); final Resource quantity; @@ -35,6 +37,12 @@ public Measure(Resource quantity, Resource unit) { this.unit = unit; } + public static > void storeInModelForAllVariable(Map measuresByVariable, Aspect aspect, Map outputModelsMap) { + for (String variable : measuresByVariable.keySet()) { + measuresByVariable.get(variable).storeInModelWithVariable(aspect, variable, outputModelsMap); + } + } + public V get(K key) { return values.get(key); } @@ -67,4 +75,10 @@ public void set(K key, V value) { values.put(key, value); } + public void storeInModel(Aspect aspect, Map outputModelsMap) { + storeInModelWithVariable(aspect, null, outputModelsMap); + } + + abstract void storeInModelWithVariable(Aspect aspect, String variable, Map outputModelsMap); + } diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetCount.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetCount.java index 8458461..4c226d0 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetCount.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetCount.java @@ -20,8 +20,6 @@ import de.uni_jena.cs.fusion.abecto.Aspect; import de.uni_jena.cs.fusion.abecto.Metadata; -import de.uni_jena.cs.fusion.abecto.vocabulary.AV; -import de.uni_jena.cs.fusion.abecto.vocabulary.OM; import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.Resource; @@ -43,13 +41,6 @@ public static Map mapOfCounts(Iterable keys, Resource return mapOfCounts; } - public void storeInModel(Aspect aspect, Map outputModelsMap) { - for (Resource dataset : keySet()) { - Metadata.addQualityMeasurement(quantity, get(dataset), unit, dataset, aspect.getIri(), - outputModelsMap.get(dataset)); - } - } - public void storeInModelWithVariable(Aspect aspect, String variable, Map outputModelsMap) { for (Resource dataset : keySet()) { Metadata.addQualityMeasurement(quantity, get(dataset), unit, dataset, variable, aspect.getIri(), outputModelsMap.get(dataset)); diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetPairCount.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetPairCount.java index 631a1a3..fe64e2d 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetPairCount.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetPairCount.java @@ -42,10 +42,6 @@ public static Map mapOfCounts(Iterable keys, Reso return mapOfCounts; } - public void storeInModel(Aspect aspect, Map outputModelsMap) { - storeInModelWithVariable(aspect, null, outputModelsMap); - } - public void storeInModelWithVariable(Aspect aspect, String variable, Map outputModelsMap) { for (ResourcePair pair : keySet()) { Metadata.addQualityMeasurement(quantity, get(pair), unit, pair.first, variable, pair.second, aspect.getIri(), outputModelsMap.get(pair.first)); diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetPairRatio.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetPairRatio.java deleted file mode 100644 index 517011a..0000000 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetPairRatio.java +++ /dev/null @@ -1,43 +0,0 @@ -/*- - * Copyright © 2019-2022 Heinz Nixdorf Chair for Distributed Information Systems, - * Friedrich Schiller University Jena (http://www.fusion.uni-jena.de/) - * Copyright © 2023-2024 Jan Martin Keil (jan-martin.keil@uni-jena.de) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. --*/ - -package de.uni_jena.cs.fusion.abecto.measure; - -import de.uni_jena.cs.fusion.abecto.Aspect; -import de.uni_jena.cs.fusion.abecto.Metadata; -import de.uni_jena.cs.fusion.abecto.ResourcePair; -import org.apache.jena.rdf.model.Model; -import org.apache.jena.rdf.model.Resource; - -import java.util.Map; - -public class PerDatasetPairRatio extends Ratio { - - public PerDatasetPairRatio(Resource quantity, Resource unit) { - super(quantity, unit); - } - - public void storeInModel(Aspect aspect, Map outputModelsMap) { - for (ResourcePair pair : keySet()) { - Metadata.addQualityMeasurement(quantity, get(pair), unit, - pair.first, pair.second, aspect.getIri(), outputModelsMap.get(pair.first)); - Metadata.addQualityMeasurement(quantity, get(pair), unit, - pair.second, pair.first, aspect.getIri(), outputModelsMap.get(pair.second)); - } - } -} diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetRatio.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetRatio.java index 5ad0816..47dbb90 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetRatio.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetRatio.java @@ -32,11 +32,22 @@ public PerDatasetRatio(Resource quantity, Resource unit) { super(quantity, unit); } + public static void storeInModelAsComparedToAllOtherResourcesForAllVariables(Map measuresByVariable, Aspect aspect, Map outputModelsMap) { + for (String variable : measuresByVariable.keySet()) { + measuresByVariable.get(variable).storeInModelAsComparedToAllOtherResourcesWithVariable(aspect, variable, outputModelsMap); + } + } + + @Override + void storeInModelWithVariable(Aspect aspect, String variable, Map outputModelsMap) { + throw new UnsupportedOperationException(); // TODO + } + public void storeInModelAsComparedToAllOtherResources(Aspect aspect, Map outputModelsMap) { - storeInModelWithVariableAsComparedToAllOtherResources(aspect, null, outputModelsMap); + storeInModelAsComparedToAllOtherResourcesWithVariable(aspect, null, outputModelsMap); } - public void storeInModelWithVariableAsComparedToAllOtherResources(Aspect aspect, String variable, Map outputModelsMap) { + public void storeInModelAsComparedToAllOtherResourcesWithVariable(Aspect aspect, String variable, Map outputModelsMap) { for (Resource dataset : values.keySet()) { Collection otherDatasets = new HashSet<>(values.keySet()); otherDatasets.remove(dataset); diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetTupelRatio.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetTupelRatio.java index c3efa10..4293b2e 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetTupelRatio.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/measure/PerDatasetTupelRatio.java @@ -45,27 +45,20 @@ public static Map mapOfRatios(Iterable keys, Res } public void setRatioOf(PerDatasetPairCount numerators, PerDatasetCount denominators) { - for (ResourcePair pair : numerators.keySet()) { - BigDecimal numerator = BigDecimal.valueOf(numerators.get(pair)); - setRatioForTupel(numerator, denominators, pair.first, pair.second); - setRatioForTupel(numerator, denominators, pair.second, pair.first); - } - } - - void setRatioForTupel(BigDecimal numerator, PerDatasetCount denominators, Resource asessedResource, Resource otherResource) { - if (denominators.contains(otherResource)) { - BigDecimal denominator = BigDecimal.valueOf(denominators.get(otherResource)); - if (!denominator.equals(BigDecimal.ZERO)) { - BigDecimal value = numerator.divide(denominator, SCALE, ROUNDING_MODE); - set(ResourceTupel.getTupel(asessedResource, otherResource), value); + for (ResourcePair datasetPair : numerators.keySet()) { + if (denominators.contains(datasetPair.first) && denominators.contains(datasetPair.second)) { + BigDecimal numerator = BigDecimal.valueOf(numerators.get(datasetPair)); + setRatioForTupel(numerator, denominators, datasetPair.first, datasetPair.second); + setRatioForTupel(numerator, denominators, datasetPair.second, datasetPair.first); } } } - public void storeInModel(Aspect aspect, Map outputModelsMap) { - for (ResourceTupel tupel : keySet()) { - Metadata.addQualityMeasurement(quantity, get(tupel), unit, - tupel.first, tupel.second, aspect.getIri(), outputModelsMap.get(tupel.first)); + void setRatioForTupel(BigDecimal numerator, PerDatasetCount denominators, Resource assessedDataset, Resource comparedDataset) { + BigDecimal denominator = BigDecimal.valueOf(denominators.get(comparedDataset)); + if (!denominator.equals(BigDecimal.ZERO)) { + BigDecimal value = numerator.divide(denominator, SCALE, ROUNDING_MODE); + set(ResourceTupel.getTupel(assessedDataset, comparedDataset), value); } } diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/ComparisonProcessor.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/ComparisonProcessor.java index 735a254..8c404b8 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/ComparisonProcessor.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/ComparisonProcessor.java @@ -26,19 +26,17 @@ import de.uni_jena.cs.fusion.abecto.measure.PerDatasetRatio; import de.uni_jena.cs.fusion.abecto.vocabulary.AV; import de.uni_jena.cs.fusion.abecto.vocabulary.OM; -import org.apache.jena.arq.querybuilder.SelectBuilder; import org.apache.jena.query.Query; import org.apache.jena.query.QueryExecutionFactory; -import org.apache.jena.query.QuerySolution; import org.apache.jena.query.ResultSet; import org.apache.jena.rdf.model.Model; -import org.apache.jena.rdf.model.RDFNode; import org.apache.jena.rdf.model.Resource; -import org.apache.jena.sparql.core.Var; import java.math.BigDecimal; import java.math.RoundingMode; -import java.util.*; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; import java.util.stream.Stream; public abstract class ComparisonProcessor

> extends Processor

{ @@ -92,87 +90,6 @@ public Stream getResourceKeys(Aspect aspect, Resource dataset) return Streams.stream(results).map(querySolution -> querySolution.getResource(keyVariableName)); } - /** - * Returns the values of the given {@link Resource} that are covered by the - * pattern of the given dataset in the given {@link Model}. If this aspect does - * not cover the given dataset or the model does not contain values for the - * given resource, {@code null} is returned. - */ - public Map> selectResourceValues(Resource resource, Resource dataset, - Aspect aspect, Collection variables) { - if (!aspect.coversDataset(dataset)) { - return Collections.emptyMap(); - } - - Model model = this.getInputPrimaryModelUnion(dataset); - Query pattern = aspect.getPattern(dataset); - Var keyVariable = aspect.getKeyVariable(); - - return this.selectResourceValues(resource, pattern, keyVariable, variables, model); - } - - /** - * Returns the values of the given {@link Resource Resources} that are covered - * by the pattern of the given dataset in the given {@link Model}. If this - * aspect does not cover the given dataset an empty result is returned. If the - * model does not contain any value for a given resource, the resource is mapped - * to {@code null}. - */ - public Map>> selectResourceValues(Collection resources, - Resource dataset, Aspect aspect, List variables) { - if (!aspect.coversDataset(dataset)) { - return Collections.emptyMap(); - } - - Model model = this.getInputPrimaryModelUnion(dataset); - Query pattern = aspect.getPattern(dataset); - Var keyVariable = aspect.getKeyVariable(); - - Map>> valuesByResource = new HashMap<>(); - - for (Resource resource : resources) { - Map> resourceValues = selectResourceValues(resource, pattern, keyVariable, variables, model); - if (resourceValues != null) { - valuesByResource.put(resource, resourceValues); - } - } - - return valuesByResource; - } - - /** - * Returns the values of the given {@link Resource} that are covered by the - * pattern of the given dataset in the given {@link Model}. If this aspect does - * not cover the given dataset or the model does not contain values for the - * given resource, {@code null} is returned. - */ - private Map> selectResourceValues(Resource resource, Query pattern, Var keyVariable, - Collection variables, Model model) { - Query query = SelectBuilder.rewrite(pattern.cloneQuery(), - Collections.singletonMap(keyVariable, resource.asNode())); - ResultSet results = QueryExecutionFactory.create(query, model).execSelect(); - - if (!results.hasNext()) { - return null; - } - - Map> values = new HashMap<>(); - for (String variable : variables) { - values.put(variable, new HashSet<>()); - } - while (results.hasNext()) { - QuerySolution result = results.next(); - for (String variable : variables) { - if (result.contains(variable)) { - RDFNode value = result.get(variable); - values.get(variable).add(value); - } - } - } - - return values; - } - Map getOutputMetaModels(Iterable datasets) { Map outputMetaModelByDataset = new HashMap<>(); for (Resource dataset : datasets) { diff --git a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java index 5afb24f..a61bc0a 100644 --- a/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java +++ b/abecto-core/src/main/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessor.java @@ -23,13 +23,12 @@ import java.util.stream.Collectors; import de.uni_jena.cs.fusion.abecto.*; -import de.uni_jena.cs.fusion.abecto.measure.PerDatasetCount; -import de.uni_jena.cs.fusion.abecto.measure.PerDatasetPairCount; -import de.uni_jena.cs.fusion.abecto.measure.PerDatasetRatio; -import de.uni_jena.cs.fusion.abecto.measure.PerDatasetTupelRatio; +import de.uni_jena.cs.fusion.abecto.measure.*; +import org.apache.jena.arq.querybuilder.SelectBuilder; import org.apache.jena.datatypes.RDFDatatype; import org.apache.jena.datatypes.xsd.XSDDateTime; import org.apache.jena.datatypes.xsd.impl.*; +import org.apache.jena.query.*; import org.apache.jena.rdf.model.Literal; import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.RDFNode; @@ -37,6 +36,7 @@ import de.uni_jena.cs.fusion.abecto.vocabulary.AV; import de.uni_jena.cs.fusion.abecto.vocabulary.OM; +import org.apache.jena.sparql.core.Var; import org.apache.jena.sparql.expr.nodevalue.NodeFunctions; public class PropertyComparisonProcessor extends ComparisonProcessor { @@ -80,20 +80,22 @@ public class PropertyComparisonProcessor extends ComparisonProcessor absoluteCoverage; - Map relativeCoverage; + Map absoluteValueCoverage; + Map relativeValueCoverage; /** * Number of values in this dataset, per variable. */ - Map count; + Map nonDistinctValuesCount; /** * Number of distinct values in this dataset, per variable. Index: variable, affectedDataset */ - Map deduplicatedCount; - Map completeness; + Map distinctValuesCount; + Map valueCompleteness; Map> uncoveredResourcesByDataset = new HashMap<>(); - + Map>>> resourcesByNonDistinctValueByDatasetByVariable = new HashMap<>(); + Map>>> resourcesByDistinctValueByDatasetByVariable = new HashMap<>(); + Map> correspondingResourcesByDataset = new HashMap<>(); @Override public final void run() { @@ -102,39 +104,8 @@ public final void run() { initializeMeasures(); resetUncoveredResources(); - getCorrespondenceGroups().forEach(correspondingResources -> { - Map> correspondingResourcesByDataset = separateByDataset(correspondingResources); - removeFromUncoveredResources(correspondingResourcesByDataset); - // get values for all corresponding resources in all datasets - Map>>> valuesByVariableByResourceByDataset = new HashMap<>(); - for (Resource dataset : datasets) { - Set correspondingResourcesOfDataset = correspondingResourcesByDataset.get(dataset); - valuesByVariableByResourceByDataset.put(dataset, selectResourceValues(correspondingResourcesOfDataset, dataset, theAspect, variables)); - removeKnownWrongValues(valuesByVariableByResourceByDataset.get(dataset), dataset); - removeExcludedValues(valuesByVariableByResourceByDataset.get(dataset)); - // increment count and deduplicated count - for (String variable : variables) { - // get values of variable for all corresponding resources in dataset - var valuesOfCorrespondingResources = new ArrayList(); - valuesByVariableByResourceByDataset.get(dataset).values().stream() - .map(m -> m.getOrDefault(variable, Collections.emptySet())) - .forEach(valuesOfCorrespondingResources::addAll); - - measureCountAndDeduplicatedCount(dataset, variable, valuesOfCorrespondingResources); - } - } - - for (ResourcePair datasetPair : datasetPairsWithRepetition) { - for (String variable : variables) { - if (theAspect.getPattern(datasetPair.first).getResultVars().contains(variable) - && theAspect.getPattern(datasetPair.second).getResultVars().contains(variable)) { - calculateDeviationsAndOmissions(variable, datasetPair, valuesByVariableByResourceByDataset); - } - } - } - }); - - countAndDeduplicateValuesOfUncoveredResource(); + compareValuesOfCorrespondingResources(); + compareValuesOfNotCorrespondingResources(); calculateCompleteness(); calculateRelativeCoverage(); @@ -142,265 +113,404 @@ public final void run() { storeMeasures(); } - private void setAspect(Resource aspect) { - theAspect = this.getAspects().get(aspect); + protected void compareValuesOfCorrespondingResources() { + getCorrespondenceGroups().forEach(this::compareValuesOfCorrespondingResources); } - private void setAspectDatasets() { - datasets = theAspect.getDatasets(); - datasetPairsWithoutRepetition = ResourcePair.getPairsWithoutRepetitionOf(datasets); - datasetPairsWithRepetition = ResourcePair.getPairsWithRepetitionOf(datasets); - datasetTupels = ResourceTupel.getTupelsOf(datasets); - outputMetaModelByDataset = getOutputMetaModels(datasets); - } + protected void compareValuesOfCorrespondingResources(List correspondingResources) { + setCorrespondingResourcesByDataset(correspondingResources); + removeCorrespondingResourcesFromUncoveredResources(); + + loadNonDistinctValues(); + calculateDistinctValues(); - private void initializeMeasures() { - count = PerDatasetCount.mapOfCounts(variables, AV.count, OM.one); - deduplicatedCount = PerDatasetCount.mapOfCounts(variables, AV.deduplicatedCount, OM.one); - absoluteCoverage = PerDatasetPairCount.mapOfCounts(variables, AV.absoluteCoverage, OM.one); - relativeCoverage = PerDatasetTupelRatio.mapOfRatios(variables, AV.relativeCoverage, OM.one); - completeness = new HashMap<>(); + measureNonDistinctValuesCount(); + measureDistinctValuesCount(); + measureAbsoluteCoverage(); + reportDeviationsAndOmissions(); } - private void resetUncoveredResources() { - uncoveredResourcesByDataset.clear(); + protected void setCorrespondingResourcesByDataset(List correspondingResources) { for (Resource dataset : datasets) { - setResourcesOfDatasetAndAspectUncovered(dataset); + Set correspondingResourcesOfDataset = correspondingResourcesByDataset.get(dataset); + correspondingResourcesOfDataset.clear(); + correspondingResourcesOfDataset.addAll(correspondingResources); + correspondingResourcesOfDataset.retainAll(uncoveredResourcesByDataset.get(dataset)); } } - Map> separateByDataset(List resources) { - Map> resourcesByDataset = new HashMap<>(); - for (Resource dataset : datasets) { - Set resourcesOfDataset = new HashSet<>(resources); - resourcesOfDataset.retainAll(uncoveredResourcesByDataset.get(dataset)); - resourcesByDataset.put(dataset, resourcesOfDataset); + protected void loadNonDistinctValues() { + resetResourcesByNonDistinctValueByDatasetByVariable(); + + Set datasetsOfCorrespondingResources = correspondingResourcesByDataset.keySet(); + + for (Resource dataset : datasetsOfCorrespondingResources) { + Collection resourcesOfDataset = correspondingResourcesByDataset.get(dataset); + loadNonDistinctValuesOfDataset(dataset, resourcesOfDataset); } - return resourcesByDataset; } - void removeFromUncoveredResources(Map> coveredResourcesByDataset) { - for (Resource dataset : coveredResourcesByDataset.keySet()) { - Set uncoveredResourcesOfDataset = uncoveredResourcesByDataset.get(dataset); - Set coveredResourcesOfDataset = coveredResourcesByDataset.get(dataset); - uncoveredResourcesOfDataset.removeAll(coveredResourcesOfDataset); + protected void loadNonDistinctValuesOfDataset(Resource dataset, Collection resourcesOfDataset) { + for (Resource resource : resourcesOfDataset) { + Map> valuesByVariable = getValuesByVariable(dataset, resource); + for (String variable : variables) { + if (theAspect.variableCoveredByDataset(variable, dataset)) { + Iterable values = valuesByVariable.get(variable); + Map> resourcesByNonDistinctValue = resourcesByNonDistinctValueByDatasetByVariable.get(variable).get(dataset); + for (RDFNode value : values) { + Set resourcesOfValue = resourcesByNonDistinctValue.computeIfAbsent(value, k -> new HashSet<>()); + resourcesOfValue.add(resource); + } + } + } } } - /** - * Removes all values that are known wrong values. - */ - private void removeKnownWrongValues(Map>> valuesByVariableByResource, Resource dataset) { - for (Resource resource : valuesByVariableByResource.keySet()) { - removeKnownWrongValues(valuesByVariableByResource.get(resource), resource, dataset); + protected Query getQueryForResource(Resource dataset, Resource resource) { + Query aspectPatternOfDataset = theAspect.getPattern(dataset).cloneQuery(); + Var keyVariable = theAspect.getKeyVariable(); + return SelectBuilder.rewrite(aspectPatternOfDataset, Collections.singletonMap(keyVariable, resource.asNode())); + } + + protected void resetResourcesByNonDistinctValueByDatasetByVariable() { + for (Map>> resourcesByNonDistinctValueByDataset : resourcesByNonDistinctValueByDatasetByVariable.values()) { + for (Map> resourcesByNonDistinctValue : resourcesByNonDistinctValueByDataset.values()) { + resourcesByNonDistinctValue.clear(); + } } } - /** - * Removes all values that are known wrong values. - */ - private void removeKnownWrongValues(Map> valuesByVariable, Resource resource, Resource dataset) { - for (String variable : valuesByVariable.keySet()) { - Set values = valuesByVariable.get(variable); - values.removeIf(value -> this.isWrongValue(resource, variable, value, dataset)); + protected void calculateDistinctValues() { + for (String variable : variables) { + for (Resource dataset : datasets) { + if (theAspect.variableCoveredByDataset(variable, dataset)) { + calculateDistinctValuesForVariableAndDataset(variable, dataset); + } + } } } - protected boolean isWrongValue(Resource affectedResource, String affectedVariableName, RDFNode affectedValue, - Resource affectedDataset) { - return Metadata.isWrongValue(affectedResource, affectedVariableName, affectedValue, aspect, - this.getInputMetaModelUnion(affectedDataset)); + protected void calculateDistinctValuesForVariableAndDataset(String variable, Resource dataset) { + Map> resourcesByNonDistinctValue = resourcesByNonDistinctValueByDatasetByVariable.get(variable).get(dataset); + Map> resourcesByDistinctValue = resourcesByDistinctValueByDatasetByVariable.get(variable).get(dataset); + resourcesByDistinctValue.clear(); + for (RDFNode nonDistinctValue : resourcesByNonDistinctValue.keySet()) { + Set resourcesOfNonDistinctValue = resourcesByNonDistinctValue.get(nonDistinctValue); + Set resourcesOfDistinctValue = getResourcesOfEquivalentDistinctValue(nonDistinctValue, resourcesByDistinctValue); + resourcesOfDistinctValue.addAll(resourcesOfNonDistinctValue); + } } - private void removeExcludedValues(Map>> valuesByVariableByResource) { - valuesByVariableByResource.forEach((resource, valuesByVariable) -> valuesByVariable.forEach((variable, values) -> values.removeIf(this::isExcludedValue))); + Set getResourcesOfEquivalentDistinctValue(RDFNode nonDistinctValue, Map> resourcesByDistinctValue) { + Set resourcesOfDistinctValue; + for (RDFNode distinctValue : resourcesByDistinctValue.keySet()) { + if (equivalentValues(nonDistinctValue, distinctValue)) { + resourcesOfDistinctValue = resourcesByDistinctValue.get(distinctValue); + return resourcesOfDistinctValue; + } + } + resourcesOfDistinctValue = new HashSet<>(); + resourcesByDistinctValue.put(nonDistinctValue, resourcesOfDistinctValue); + return resourcesOfDistinctValue; } - private void setResourcesOfDatasetAndAspectUncovered(Resource dataset) { - Set distinctResources = getResourceKeys(theAspect, dataset).collect(Collectors.toSet()); - uncoveredResourcesByDataset.put(dataset, distinctResources); + protected void measureNonDistinctValuesCount() { + for (String variable : variables) { + Map>> resourcesByNonDistinctValueByDataset = resourcesByNonDistinctValueByDatasetByVariable.get(variable); + PerDatasetCount nonDistinctValuesCountOfVariable = nonDistinctValuesCount.get(variable); + for (Resource dataset : datasets) { + if (theAspect.variableCoveredByDataset(variable, dataset)) { + nonDistinctValuesCountOfVariable.setZero(dataset); + Map> resourcesByNonDistinctValue = resourcesByNonDistinctValueByDataset.get(dataset); + for (RDFNode nonDistinctValue : resourcesByNonDistinctValue.keySet()) { + Set resourcesOfNonDistinctValue = resourcesByNonDistinctValue.get(nonDistinctValue); + nonDistinctValuesCountOfVariable.incrementByOrSet(dataset, resourcesOfNonDistinctValue.size()); + } + } + } + } } - private void countAndDeduplicateValuesOfUncoveredResource() { - for (Resource dataset : datasets) { - Set uncoveredResources = uncoveredResourcesByDataset.get(dataset); - for (Resource uncoveredResource : uncoveredResources) { - // TODO refactor + protected void measureDistinctValuesCount() { + for (String variable : variables) { + Map>> resourcesByDistinctValueByDataset = resourcesByDistinctValueByDatasetByVariable.get(variable); + for (Resource dataset : datasets) { + if (theAspect.variableCoveredByDataset(variable, dataset)) { + Map> resourcesByDistinctValue = resourcesByDistinctValueByDataset.get(dataset); + distinctValuesCount.get(variable).incrementByOrSet(dataset, resourcesByDistinctValue.size()); + } + } + } + } - // get resource values - Map> valuesByVariable = selectResourceValues(uncoveredResource, dataset, theAspect, variables); + protected void measureAbsoluteCoverage() { + for (String variable : variables) { + Map>> resourcesByDistinctValueByDataset = resourcesByDistinctValueByDatasetByVariable.get(variable); + PerDatasetPairCount absoluteCoverageForVariable = absoluteValueCoverage.get(variable); + for (ResourcePair datasetPair : datasetPairsWithoutRepetition) { + absoluteCoverageForVariable.setZero(datasetPair); + if (theAspect.variableCoveredByDatasets(variable, datasetPair.first, datasetPair.second)) { + Set distinctValuesOfFirstDataset = resourcesByDistinctValueByDataset.get(datasetPair.first).keySet(); + Set distinctValuesOfSecondDataset = resourcesByDistinctValueByDataset.get(datasetPair.second).keySet(); + for (RDFNode valueOfFirstDataset : distinctValuesOfFirstDataset) { + for (RDFNode valueOfSecondDataset : distinctValuesOfSecondDataset) { + if (equivalentValues(valueOfFirstDataset, valueOfSecondDataset)) { + absoluteCoverageForVariable.incrementByOrSetOne(datasetPair); + break; + } + } + } + } + } + } + } - // removeExcludedValues - valuesByVariable.forEach((k, v) -> v.removeIf(this::isExcludedValue)); + protected void reportDeviationsAndOmissions() { + for (String variable : variables) { + Map>> resourcesByNonDistinctValueByDataset = resourcesByNonDistinctValueByDatasetByVariable.get(variable); + for (ResourcePair datasetPair : datasetPairsWithRepetition) { + if (theAspect.variableCoveredByDatasets(variable, datasetPair.first, datasetPair.second)) { + Map> resourceByNonDistinctValuesOfFirstDataset = resourcesByNonDistinctValueByDataset.get(datasetPair.first); + Map> resourceByNonDistinctValuesOfSecondDataset = resourcesByNonDistinctValueByDataset.get(datasetPair.second); + for (Resource firstResource : correspondingResourcesByDataset.get(datasetPair.first)) { + for (Resource secondResource : correspondingResourcesByDataset.get(datasetPair.second)) { + Set uncoveredValuesOfFirstResource = + getUncoveredValuesOfResource(firstResource, secondResource, resourceByNonDistinctValuesOfFirstDataset, resourceByNonDistinctValuesOfSecondDataset); + Set uncoveredValuesOfSecondResource = + getUncoveredValuesOfResource(secondResource, firstResource, resourceByNonDistinctValuesOfSecondDataset, resourceByNonDistinctValuesOfFirstDataset); + + // deviation: a pair of resources with each having a value not present in the + // other resource + // omission: a pair of resources with one having a value not present in the other, + // but not vice versa + + // report missing not matching values + if (uncoveredValuesOfFirstResource.isEmpty()) { + for (RDFNode value2 : uncoveredValuesOfSecondResource) { + Metadata.addValuesOmission(firstResource, variable, datasetPair.second, secondResource, value2, aspect, + getOutputMetaModel(datasetPair.first)); + } + } else if (uncoveredValuesOfSecondResource.isEmpty()) { + for (RDFNode value1 : uncoveredValuesOfFirstResource) { + Metadata.addValuesOmission(secondResource, variable, datasetPair.first, firstResource, value1, aspect, + getOutputMetaModel(datasetPair.second)); + } + } else { + // report pairs of deviating values + for (RDFNode value1 : uncoveredValuesOfFirstResource) { + for (RDFNode value2 : uncoveredValuesOfSecondResource) { + Metadata.addDeviation(firstResource.asResource(), variable, value1, datasetPair.second, + secondResource.asResource(), value2, aspect, getOutputMetaModel(datasetPair.first)); + Metadata.addDeviation(secondResource.asResource(), variable, value2, datasetPair.first, + firstResource.asResource(), value1, aspect, getOutputMetaModel(datasetPair.second)); + } + } + } + } + } + } + } + } + } - for (String variable : valuesByVariable.keySet()) { - Collection valuesOfVariable = valuesByVariable.get(variable); - measureCountAndDeduplicatedCount(dataset, variable, valuesOfVariable); + protected Set getUncoveredValuesOfResource(Resource resource, Resource comparedResource, Map> resourcesByValues, Map> comparedResourcesByValues) { + Set uncoveredValuesOfResource = new HashSet<>(); + Set values = resourcesByValues.keySet(); + Set comparedValues = comparedResourcesByValues.keySet(); + iterationOfComparedValues: + for (RDFNode value : values) { + if (isValueOfResource(value, resource, resourcesByValues)) { + for (RDFNode comparedValue : comparedValues) { + if (isValueOfResource(comparedValue, comparedResource, comparedResourcesByValues)) { + if (equivalentValues(value, comparedValue)) { + continue iterationOfComparedValues; + } + } } + uncoveredValuesOfResource.add(value); } } + return uncoveredValuesOfResource; } - void measureCountAndDeduplicatedCount(Resource dataset, String variable, Collection valuesOfVariable) { - long valuesCountWithDuplicates = valuesOfVariable.size(); - long valuesCountWithoutDuplicates = deduplicate(valuesOfVariable).size(); - count.get(variable).incrementByOrSet(dataset, valuesCountWithDuplicates); - deduplicatedCount.get(variable).incrementByOrSet(dataset, valuesCountWithoutDuplicates); + boolean isValueOfResource(RDFNode value, Resource resource, Map> resourcesByValues) { + return resourcesByValues.get(value).contains(resource); } - private void calculateCompleteness() { - for (String variable : variables) { - // TODO add value exclusion filter description to measurement description - completeness.put(variable, calculateCompleteness(datasetPairsWithoutRepetition, absoluteCoverage.get(variable), deduplicatedCount.get(variable))); + protected void setAspect(Resource aspect) { + theAspect = this.getAspects().get(aspect); + } + + protected void setAspectDatasets() { + datasets = theAspect.getDatasets(); + datasetPairsWithoutRepetition = ResourcePair.getPairsWithoutRepetitionOf(datasets); + datasetPairsWithRepetition = ResourcePair.getPairsWithRepetitionOf(datasets); + datasetTupels = ResourceTupel.getTupelsOf(datasets); + outputMetaModelByDataset = getOutputMetaModels(datasets); + initializeCorrespondingResourceByDataset(); + resourcesByNonDistinctValueByDatasetByVariable = getMapOfResourcesByValueByDatasetByVariable(); + resourcesByDistinctValueByDatasetByVariable = getMapOfResourcesByValueByDatasetByVariable(); + } + + protected void initializeCorrespondingResourceByDataset() { + correspondingResourcesByDataset = new HashMap<>(); + for (Resource dataset : datasets) { + correspondingResourcesByDataset.put(dataset, new HashSet<>()); } } - private void calculateRelativeCoverage() { + protected Map>>> getMapOfResourcesByValueByDatasetByVariable() { + Map>>> map = new HashMap<>(); for (String variable : variables) { - PerDatasetTupelRatio relativeCoverageOfVariable = relativeCoverage.get(variable); - PerDatasetPairCount absoluteCoverageOfVariable = absoluteCoverage.get(variable); - PerDatasetCount deduplicatedCountOfVariable = deduplicatedCount.get(variable); - relativeCoverageOfVariable.setRatioOf(absoluteCoverageOfVariable, deduplicatedCountOfVariable); + Map>> resourcesByValueByDataset = new HashMap<>(); + map.put(variable, resourcesByValueByDataset); + for (Resource dataset : datasets) { + if (theAspect.variableCoveredByDataset(variable, dataset)) { + resourcesByValueByDataset.put(dataset, new HashMap<>()); + } + } } + return map; } - private void storeMeasures() { - storeCount(); - storeDeduplicatedCount(); - storeAbsoluteCoverage(); - storeRelativeCoverage(); - storeCompleteness(); + protected void initializeMeasures() { + nonDistinctValuesCount = PerDatasetCount.mapOfCounts(variables, AV.count, OM.one); + distinctValuesCount = PerDatasetCount.mapOfCounts(variables, AV.deduplicatedCount, OM.one); + absoluteValueCoverage = PerDatasetPairCount.mapOfCounts(variables, AV.absoluteCoverage, OM.one); + relativeValueCoverage = PerDatasetTupelRatio.mapOfRatios(variables, AV.relativeCoverage, OM.one); + valueCompleteness = new HashMap<>(); } - private void storeCount() { - for (String variable : variables) { - // TODO add value exclusion filter description to measurement description - count.get(variable).storeInModelWithVariable(theAspect, variable, outputMetaModelByDataset); + protected void resetUncoveredResources() { + uncoveredResourcesByDataset.clear(); + for (Resource dataset : datasets) { + setResourcesOfDatasetAndAspectUncovered(dataset); } } - private void storeDeduplicatedCount() { - for (String variable : variables) { - // TODO add value exclusion filter description to measurement description - deduplicatedCount.get(variable).storeInModelWithVariable(theAspect, variable, outputMetaModelByDataset); + protected void removeCorrespondingResourcesFromUncoveredResources() { + for (Resource dataset : correspondingResourcesByDataset.keySet()) { + Set uncoveredResourcesOfDataset = uncoveredResourcesByDataset.get(dataset); + Set coveredResourcesOfDataset = correspondingResourcesByDataset.get(dataset); + uncoveredResourcesOfDataset.removeAll(coveredResourcesOfDataset); } } - private void storeAbsoluteCoverage() { - for (String variable : variables) { - // TODO add value exclusion filter description to measurement description - absoluteCoverage.get(variable).storeInModelWithVariable(theAspect, variable, outputMetaModelByDataset); - } + protected boolean isKnownWrongValue(Resource affectedResource, String affectedVariableName, RDFNode affectedValue, + Resource affectedDataset) { + return Metadata.isWrongValue(affectedResource, affectedVariableName, affectedValue, aspect, + this.getInputMetaModelUnion(affectedDataset)); } - private void storeRelativeCoverage() { - for (String variable : variables) { - // TODO add value exclusion filter description to measurement description - relativeCoverage.get(variable).storeInModelWithVariable(theAspect, variable, outputMetaModelByDataset); + protected void setResourcesOfDatasetAndAspectUncovered(Resource dataset) { + Set distinctResources = getResourceKeys(theAspect, dataset).collect(Collectors.toSet()); + uncoveredResourcesByDataset.put(dataset, distinctResources); + } + + protected void compareValuesOfNotCorrespondingResources() { + for (Resource dataset : datasets) { + Set uncoveredResources = uncoveredResourcesByDataset.get(dataset); + for (Resource uncoveredResource : uncoveredResources) { + Map> valuesByVariable = getValuesByVariable(dataset, uncoveredResource); + for (String variable : valuesByVariable.keySet()) { + Set valuesOfVariable = valuesByVariable.get(variable); + measureCountAndDeduplicatedCount(dataset, variable, valuesOfVariable); + } + } } } - private void storeCompleteness() { - for (String variable : variables) { - // TODO add value exclusion filter description to measurement description - completeness.get(variable).storeInModelWithVariableAsComparedToAllOtherResources(theAspect, variable, outputMetaModelByDataset); + protected Map> getValuesByVariable(Resource dataset, Resource resource) { + if (!theAspect.coversDataset(dataset)) { + return Collections.emptyMap(); } + Query query = getQueryForResource(dataset, resource); + Model model = getInputPrimaryModelUnion(dataset); + Map> valuesByVariable = getValuesByVariable(model, query); + // TODO do not ignore wrong values for measures -> remove check + valuesByVariable.forEach((variable, values) -> values.removeIf(value -> isKnownWrongValue(resource, variable, value, dataset))); + return valuesByVariable; } - /** - * Note: Not the most efficient way to do this, but there is no - * {@link Comparator} available to use {@link TreeMap#TreeMap(Comparator)}. - */ - private void mapResources(String variable, Map> resourcesByMappedValues, - Map>> valuesByVariableByResource) { - for (Resource resource : valuesByVariableByResource.keySet()) { - for (RDFNode value : valuesByVariableByResource.get(resource).getOrDefault(variable, - Collections.emptySet())) { - for (RDFNode valueKey : resourcesByMappedValues.keySet()) { - if (equivalentValues(value, valueKey)) { - Set valuesSet = resourcesByMappedValues.get(valueKey); - // add values to existing set - valuesSet.add(resource); - // map value to same set, so equivalent values will share one set - resourcesByMappedValues.putIfAbsent(value, valuesSet); - break; + protected Map> getValuesByVariable(Model model, Query query) { + try (QueryExecution queryExecution = QueryExecutionFactory.create(query, model)) { + ResultSet results = queryExecution.execSelect(); + List relevantVariables = getRelevantVariables(results); + Map> valuesByVariable = new HashMap<>(); + for (String variable : relevantVariables) { + valuesByVariable.put(variable, new HashSet<>()); + } + while (results.hasNext()) { + QuerySolution result = results.next(); + for (String variable : relevantVariables) { + if (result.contains(variable)) { + RDFNode value = result.get(variable); + if (!isExcludedValue(value)) { + valuesByVariable.get(variable).add(value); + } } } - // no equivalent value in map - resourcesByMappedValues.computeIfAbsent(value, v -> new HashSet<>()).add(resource); } + return valuesByVariable; } } - /** - * Compares the values of one variable from two corresponding resources in two - * datasets and stores encountered deviations and issues in both according - * outputMetaModels (derived by {@link #getOutputMetaModel(Resource)}). Either - * the corresponding resources or the datasets might be equal, but not both at - * once. - * - * @param variable Name of the compared variable - */ - public void calculateDeviationsAndOmissions(String variable, ResourcePair datasetPair, - Map>>> valuesByVariableByResourceByDataset) { - - Map>> valuesByVariableByResource1 = valuesByVariableByResourceByDataset.get(datasetPair.first); - Map>> valuesByVariableByResource2 = valuesByVariableByResourceByDataset.get(datasetPair.second); + List getRelevantVariables(ResultSet results) { + List relevantVariables = results.getResultVars(); + relevantVariables.retainAll(variables); + return relevantVariables; + } - // create common value-resource look-up - Map> resourcesByMappedValues = new HashMap<>(); - mapResources(variable, resourcesByMappedValues, valuesByVariableByResource1); - mapResources(variable, resourcesByMappedValues, valuesByVariableByResource2); + protected void measureCountAndDeduplicatedCount(Resource dataset, String variable, Set valuesOfVariable) { + long valuesCountWithDuplicates = valuesOfVariable.size(); + long valuesCountWithoutDuplicates = countDistinctValues(valuesOfVariable); + nonDistinctValuesCount.get(variable).incrementByOrSet(dataset, valuesCountWithDuplicates); + distinctValuesCount.get(variable).incrementByOrSet(dataset, valuesCountWithoutDuplicates); + } + protected int countDistinctValues(Set nonDistinctValues) { + List distinctValues = new ArrayList<>(nonDistinctValues.size()); + iterationOfNonDistinctValues: + for (RDFNode nonDistinctValue : nonDistinctValues) { + for (RDFNode distinctValue : distinctValues) { + if (equivalentValues(nonDistinctValue, distinctValue)) { + continue iterationOfNonDistinctValues; + } + } + distinctValues.add(nonDistinctValue); + } + return distinctValues.size(); + } - // update measurements - if (!datasetPair.first.equals(datasetPair.second)) {// do not measure for first == second - // TODO test, that no absolute coverage exist for dataset compared with itself - int pairwiseOverlap = getPairwiseOverlap(valuesByVariableByResource1.keySet(), valuesByVariableByResource2.keySet(), resourcesByMappedValues); - absoluteCoverage.get(variable).incrementByOrSet(datasetPair, pairwiseOverlap); + protected void calculateCompleteness() { + for (String variable : variables) { + PerDatasetRatio valueCompletenessOfVariable = calculateCompleteness(datasetPairsWithoutRepetition, absoluteValueCoverage.get(variable), distinctValuesCount.get(variable)); + valueCompleteness.put(variable, valueCompletenessOfVariable); } + } - // deviation: a pair of resources with each having a value not present in the - // other resource - // omission: a pair of resources with one having a value not present in the other, - // but not vice versa - - for (Resource resource1 : valuesByVariableByResource1.keySet()) { - var values1 = valuesByVariableByResource1.get(resource1).getOrDefault(variable, Collections.emptySet()); - for (Resource resource2 : valuesByVariableByResource2.keySet()) { - var values2 = valuesByVariableByResource2.get(resource2).getOrDefault(variable, Collections.emptySet()); - - var notMatchingValues1 = values1.stream().filter(value1 -> !resourcesByMappedValues - .getOrDefault(value1, Collections.emptySet()).contains(resource2)).toList(); - var notMatchingValues2 = values2.stream().filter(value2 -> !resourcesByMappedValues - .getOrDefault(value2, Collections.emptySet()).contains(resource1)).toList(); - - // report missing not matching values - if (notMatchingValues1.isEmpty()) { - for (RDFNode value2 : notMatchingValues2) { - Metadata.addValuesOmission(resource1, variable, datasetPair.second, resource2, value2, this.aspect, - this.getOutputMetaModel(datasetPair.first)); - } - } else if (notMatchingValues2.isEmpty()) { - for (RDFNode value1 : notMatchingValues1) { - Metadata.addValuesOmission(resource2, variable, datasetPair.first, resource1, value1, this.aspect, - this.getOutputMetaModel(datasetPair.second)); - } - } else { - // report pairs of deviating values - for (RDFNode value1 : notMatchingValues1) { - for (RDFNode value2 : notMatchingValues2) { - Metadata.addDeviation(resource1.asResource(), variable, value1, datasetPair.second, - resource2.asResource(), value2, this.aspect, this.getOutputMetaModel(datasetPair.first)); - Metadata.addDeviation(resource2.asResource(), variable, value2, datasetPair.first, - resource1.asResource(), value1, this.aspect, this.getOutputMetaModel(datasetPair.second)); - } - } - } - } + protected void calculateRelativeCoverage() { + for (String variable : variables) { + PerDatasetTupelRatio relativeCoverageOfVariable = relativeValueCoverage.get(variable); + PerDatasetPairCount absoluteCoverageOfVariable = absoluteValueCoverage.get(variable); + PerDatasetCount deduplicatedCountOfVariable = distinctValuesCount.get(variable); + relativeCoverageOfVariable.setRatioOf(absoluteCoverageOfVariable, deduplicatedCountOfVariable); } } + protected void storeMeasures() { + // TODO add value exclusion filter description to measurement description + Measure.storeInModelForAllVariable(nonDistinctValuesCount, theAspect, outputMetaModelByDataset); + // TODO add value exclusion filter description to measurement description + Measure.storeInModelForAllVariable(distinctValuesCount, theAspect, outputMetaModelByDataset); + // TODO add value exclusion filter description to measurement description + Measure.storeInModelForAllVariable(absoluteValueCoverage, theAspect, outputMetaModelByDataset); + // TODO add value exclusion filter description to measurement description + Measure.storeInModelForAllVariable(relativeValueCoverage, theAspect, outputMetaModelByDataset); + // TODO add value exclusion filter description to measurement description + PerDatasetRatio.storeInModelAsComparedToAllOtherResourcesForAllVariables(valueCompleteness, theAspect, outputMetaModelByDataset); + } + /** * Checks if two values are equivalent. * @@ -408,7 +518,7 @@ public void calculateDeviationsAndOmissions(String variable, ResourcePair datase * @param value2 the second value to compare * @return {@code true}, if the values are equivalent, otherwise {@code false} */ - public boolean equivalentValues(RDFNode value1, RDFNode value2) { + protected boolean equivalentValues(RDFNode value1, RDFNode value2) { if (value1.isResource() && value2.isResource()) { return correspond(value1.asResource(), value2.asResource()); } else if (value1.isLiteral() && value2.isLiteral()) { @@ -537,7 +647,7 @@ public boolean equivalentValues(RDFNode value1, RDFNode value2) { * @param value the value to check * @return {@code true}, if the value should not be used, otherwise {@code false} */ - public boolean isExcludedValue(RDFNode value) { + protected boolean isExcludedValue(RDFNode value) { if (!value.isLiteral() || !(value.asLiteral().getDatatype() instanceof XSDBaseStringType) && !(value.asLiteral().getDatatype() instanceof RDFLangString)) { @@ -548,40 +658,4 @@ public boolean isExcludedValue(RDFNode value) { .noneMatch(languageFilterPattern -> NodeFunctions.langMatches(langStr, languageFilterPattern)); } } - - /** - * Returns a duplicate free list of the given values by removing all values equivalent to an earlier value. - */ - private List deduplicate(Iterable values) { - ArrayList distinctValues = new ArrayList<>(); - for (RDFNode value : values) { - if (distinctValues.stream().noneMatch(v -> equivalentValues(v, value))) { - distinctValues.add(value); - } - } - return distinctValues; - } - - private Set distinctByIdentity(Collection items) { - Set distinctItems = Collections.newSetFromMap(new IdentityHashMap<>()); - distinctItems.addAll(items); - return distinctItems; - } - - private int getPairwiseOverlap(Collection resources1, - Collection resources2, Map> resourcesByMappedValues) { - int pairwiseOverlap = 0; - // use set of resource sets for mapping values - // NOTE: equivalent values use the same set, so get distinct set instances - for (Set resourceSet : distinctByIdentity(resourcesByMappedValues.values())) { - if (resources1.stream().noneMatch(resourceSet::contains)) { - continue; - } - if (resources2.stream().noneMatch(resourceSet::contains)) { - continue; - } - pairwiseOverlap++; - } - return pairwiseOverlap; - } } diff --git a/abecto-core/src/test/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessorTest.java b/abecto-core/src/test/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessorTest.java index 3e08844..053048f 100644 --- a/abecto-core/src/test/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessorTest.java +++ b/abecto-core/src/test/java/de/uni_jena/cs/fusion/abecto/processor/PropertyComparisonProcessorTest.java @@ -157,6 +157,7 @@ void assertDeviationOneDirection(Aspect aspect, Collection values1, Col dataset(1), dataset(2), aspect(1), outputMetaModels[0], outputMetaModels[1]); } + private void assertMeasurements(BigDecimal expectedCount1, BigDecimal expectedCount2, BigDecimal expectedDeduplicatedCount1, BigDecimal expectedDeduplicatedCount2, BigDecimal expectedAbsoluteCoverage1, BigDecimal expectedAbsoluteCoverage2,