From 2c5d2866073c899803962bb6017a73baf49194ed Mon Sep 17 00:00:00 2001 From: Luigi Asprino Date: Fri, 3 Nov 2023 17:25:05 +0100 Subject: [PATCH] Fix string casting (see #416) Include QGram distance (see #394) --- pom.xml | 7 + sparql-anything-engine/pom.xml | 6 + .../github/sparqlanything/engine/FacadeX.java | 148 +++++++----------- .../engine/functions/FunctionsUtils.java | 35 +++++ .../SimilarityScoreFunctionFactory.java | 50 ++++++ .../StringDistanceFunctionFactory.java | 27 +--- .../engine/test/FunctionsTest.java | 13 +- .../github/sparqlanything/it/SandboxTest.java | 8 + 8 files changed, 176 insertions(+), 118 deletions(-) create mode 100644 sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/FunctionsUtils.java create mode 100644 sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/SimilarityScoreFunctionFactory.java diff --git a/pom.xml b/pom.xml index 377642b3..9f8910d8 100644 --- a/pom.xml +++ b/pom.xml @@ -406,6 +406,13 @@ + + info.debatty + java-string-similarity + 2.0.0 + + + diff --git a/sparql-anything-engine/pom.xml b/sparql-anything-engine/pom.xml index cc0f94ac..28ee3e99 100644 --- a/sparql-anything-engine/pom.xml +++ b/sparql-anything-engine/pom.xml @@ -149,6 +149,12 @@ test + + + info.debatty + java-string-similarity + + diff --git a/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/FacadeX.java b/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/FacadeX.java index ec9f2679..49d63856 100644 --- a/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/FacadeX.java +++ b/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/FacadeX.java @@ -16,6 +16,7 @@ package io.github.sparqlanything.engine; +import info.debatty.java.stringsimilarity.QGram; import io.github.sparqlanything.engine.functions.*; import io.github.sparqlanything.engine.functions.reflection.ReflectionFunctionFactory; import io.github.sparqlanything.model.Triplifier; @@ -39,53 +40,32 @@ public final class FacadeX { public final static OpExecutorFactory ExecutorFactory = FacadeXOpExecutor::new; public final static TriplifierRegister Registry = TriplifierRegister.getInstance(); + public static final String ANY_SLOT_URI = Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "anySlot"; private static final Logger log = LoggerFactory.getLogger(FacadeX.class); - public static final String ANY_SLOT_URI = Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "anySlot"; static { try { log.trace("Registering isFacadeXExtension function"); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "isFacadeXExtension", - IsFacadeXExtension.class); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "isFacadeXExtension", IsFacadeXExtension.class); enablingMagicProperties(); enablingFunctions(); log.trace("Registering standard triplifiers"); - Registry.registerTriplifier("io.github.sparqlanything.bib.BibtexTriplifier", - new String[]{"bib", "bibtex"}, new String[]{"application/x-bibtex"}); - Registry.registerTriplifier("io.github.sparqlanything.xml.XMLTriplifier", new String[]{"xml"}, - new String[]{"application/xml", "text/xml"}); - Registry.registerTriplifier("io.github.sparqlanything.csv.CSVTriplifier", new String[]{"csv", "tsv", "tab"}, - new String[]{"text/csv", "text/tab-separated-values"}); - Registry.registerTriplifier("io.github.sparqlanything.html.HTMLTriplifier", new String[]{"html"}, - new String[]{"text/html"}); - Registry.registerTriplifier("io.github.sparqlanything.text.TextTriplifier", new String[]{"txt"}, - new String[]{"text/plain"}); - Registry.registerTriplifier("io.github.sparqlanything.markdown.MARKDOWNTriplifier", new String[]{"md"}, - new String[]{"text/markdown", "text/x-markdown"}); - Registry.registerTriplifier("io.github.sparqlanything.docs.DocxTriplifier", new String[]{"docx"}, - new String[]{"application/vnd.openxmlformats-officedocument.wordprocessingml.document"}); - Registry.registerTriplifier("io.github.sparqlanything.zip.TarTriplifier", new String[]{"tar"}, - new String[]{"application/x-tar"}); - Registry.registerTriplifier("io.github.sparqlanything.zip.ZipTriplifier", new String[]{"zip"}, - new String[]{"application/zip"}); - Registry.registerTriplifier("io.github.sparqlanything.binary.BinaryTriplifier", - new String[]{"bin", "dat"}, new String[]{"application/octet-stream"}); - Registry.registerTriplifier("io.github.sparqlanything.json.JSONTriplifier", new String[]{"json"}, - new String[]{"application/json", "application/problem+json"}); - Registry.registerTriplifier("io.github.sparqlanything.yaml.YAMLTriplifier", new String[]{"yaml"}, - new String[]{"application/yaml", "text/yaml", "x-text/yaml"}); - Registry.registerTriplifier("io.github.sparqlanything.spreadsheet.SpreadsheetTriplifier", - new String[]{"xls", "xlsx"}, new String[]{"application/vnd.ms-excel", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"}); - Registry.registerTriplifier(RDFTriplifier.class.getCanonicalName(), - new String[]{"rdf", "ttl", "nt", "jsonld", "owl", "trig", "nq", "trix", "trdf"}, - new String[]{"application/rdf+thrift", "application/trix+xml", "application/n-quads", "text/trig", - "application/owl+xml", "text/turtle", "application/rdf+xml", "application/n-triples", - "application/ld+json"}); - Registry.registerTriplifier("io.github.sparqlanything.binary.BinaryTriplifier", - new String[]{"png", "jpeg", "jpg", "bmp", "tiff", "tif", "ico"}, - new String[]{"image/png", "image/jpeg", "image/bmp", "image/tiff", "image/vnd.microsoft.icon"}); + Registry.registerTriplifier("io.github.sparqlanything.bib.BibtexTriplifier", new String[]{"bib", "bibtex"}, new String[]{"application/x-bibtex"}); + Registry.registerTriplifier("io.github.sparqlanything.xml.XMLTriplifier", new String[]{"xml"}, new String[]{"application/xml", "text/xml"}); + Registry.registerTriplifier("io.github.sparqlanything.csv.CSVTriplifier", new String[]{"csv", "tsv", "tab"}, new String[]{"text/csv", "text/tab-separated-values"}); + Registry.registerTriplifier("io.github.sparqlanything.html.HTMLTriplifier", new String[]{"html"}, new String[]{"text/html"}); + Registry.registerTriplifier("io.github.sparqlanything.text.TextTriplifier", new String[]{"txt"}, new String[]{"text/plain"}); + Registry.registerTriplifier("io.github.sparqlanything.markdown.MARKDOWNTriplifier", new String[]{"md"}, new String[]{"text/markdown", "text/x-markdown"}); + Registry.registerTriplifier("io.github.sparqlanything.docs.DocxTriplifier", new String[]{"docx"}, new String[]{"application/vnd.openxmlformats-officedocument.wordprocessingml.document"}); + Registry.registerTriplifier("io.github.sparqlanything.zip.TarTriplifier", new String[]{"tar"}, new String[]{"application/x-tar"}); + Registry.registerTriplifier("io.github.sparqlanything.zip.ZipTriplifier", new String[]{"zip"}, new String[]{"application/zip"}); + Registry.registerTriplifier("io.github.sparqlanything.binary.BinaryTriplifier", new String[]{"bin", "dat"}, new String[]{"application/octet-stream"}); + Registry.registerTriplifier("io.github.sparqlanything.json.JSONTriplifier", new String[]{"json"}, new String[]{"application/json", "application/problem+json"}); + Registry.registerTriplifier("io.github.sparqlanything.yaml.YAMLTriplifier", new String[]{"yaml"}, new String[]{"application/yaml", "text/yaml", "x-text/yaml"}); + Registry.registerTriplifier("io.github.sparqlanything.spreadsheet.SpreadsheetTriplifier", new String[]{"xls", "xlsx"}, new String[]{"application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"}); + Registry.registerTriplifier(RDFTriplifier.class.getCanonicalName(), new String[]{"rdf", "ttl", "nt", "jsonld", "owl", "trig", "nq", "trix", "trdf"}, new String[]{"application/rdf+thrift", "application/trix+xml", "application/n-quads", "text/trig", "application/owl+xml", "text/turtle", "application/rdf+xml", "application/n-triples", "application/ld+json"}); + Registry.registerTriplifier("io.github.sparqlanything.binary.BinaryTriplifier", new String[]{"png", "jpeg", "jpg", "bmp", "tiff", "tif", "ico"}, new String[]{"image/png", "image/jpeg", "image/bmp", "image/tiff", "image/vnd.microsoft.icon"}); } catch (TriplifierRegisterException e) { throw new RuntimeException(e); @@ -104,9 +84,9 @@ public static void enablingMagicProperties() { final PropertyFunctionRegistry reg = PropertyFunctionRegistry.chooseRegistry(ARQ.getContext()); //log.trace("Registering {} magic property", ANY_SLOT_URI); reg.put(ANY_SLOT_URI, p); - if(log.isTraceEnabled()){ + if (log.isTraceEnabled()) { Iterator i = reg.keys(); - while(i.hasNext()){ + while (i.hasNext()) { log.trace("Registering magic property: {}", i.next()); } } @@ -129,58 +109,35 @@ public static void enablingFunctions() { FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "isContainerMembershipProperty", IsContainerMembershipProperty.class); log.trace("Enabling String functions"); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.trim", - ReflectionFunctionFactory.get().makeFunction(String.class, "trim")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.substring", - ReflectionFunctionFactory.get().makeFunction(String.class, "substring")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.indexOf", - ReflectionFunctionFactory.get().makeFunction(String.class, "indexOf")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.startsWith", - ReflectionFunctionFactory.get().makeFunction(String.class, "startsWith")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.endsWith", - ReflectionFunctionFactory.get().makeFunction(String.class, "endsWith")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.replace", - ReflectionFunctionFactory.get().makeFunction(String.class, "replace")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.strip", - ReflectionFunctionFactory.get().makeFunction(String.class, "strip")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.stripLeading", - ReflectionFunctionFactory.get().makeFunction(String.class, "stripLeading")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.stripTrailing", - ReflectionFunctionFactory.get().makeFunction(String.class, "stripTrailing")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.trim", ReflectionFunctionFactory.get().makeFunction(String.class, "trim")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.substring", ReflectionFunctionFactory.get().makeFunction(String.class, "substring")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.indexOf", ReflectionFunctionFactory.get().makeFunction(String.class, "indexOf")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.startsWith", ReflectionFunctionFactory.get().makeFunction(String.class, "startsWith")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.endsWith", ReflectionFunctionFactory.get().makeFunction(String.class, "endsWith")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.replace", ReflectionFunctionFactory.get().makeFunction(String.class, "replace")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.strip", ReflectionFunctionFactory.get().makeFunction(String.class, "strip")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.stripLeading", ReflectionFunctionFactory.get().makeFunction(String.class, "stripLeading")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.stripTrailing", ReflectionFunctionFactory.get().makeFunction(String.class, "stripTrailing")); FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.removeTags", RemoveTags.class); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.lastIndexOf", - ReflectionFunctionFactory.get().makeFunction(String.class, "lastIndexOf")); - - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.md2Hex", - ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "md2Hex")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.md5Hex", - ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "md5Hex")); - - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.sha1Hex", - ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "sha1Hex")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.sha256Hex", - ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "sha256Hex")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.sha384Hex", - ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "sha384Hex")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.sha512Hex", - ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "sha512Hex")); - - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "WordUtils.capitalize", - ReflectionFunctionFactory.get().makeFunction(WordUtils.class, "capitalize")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "WordUtils.capitalizeFully", - ReflectionFunctionFactory.get().makeFunction(WordUtils.class, "capitalizeFully")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "WordUtils.initials", - ReflectionFunctionFactory.get().makeFunction(WordUtils.class, "initials")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "WordUtils.swapCase", - ReflectionFunctionFactory.get().makeFunction(WordUtils.class, "swapCase")); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "WordUtils.uncapitalize", - ReflectionFunctionFactory.get().makeFunction(WordUtils.class, "uncapitalize")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.lastIndexOf", ReflectionFunctionFactory.get().makeFunction(String.class, "lastIndexOf")); + + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.md2Hex", ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "md2Hex")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.md5Hex", ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "md5Hex")); + + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.sha1Hex", ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "sha1Hex")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.sha256Hex", ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "sha256Hex")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.sha384Hex", ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "sha384Hex")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "DigestUtils.sha512Hex", ReflectionFunctionFactory.get().makeFunction(DigestUtils.class, "sha512Hex")); + + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "WordUtils.capitalize", ReflectionFunctionFactory.get().makeFunction(WordUtils.class, "capitalize")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "WordUtils.capitalizeFully", ReflectionFunctionFactory.get().makeFunction(WordUtils.class, "capitalizeFully")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "WordUtils.initials", ReflectionFunctionFactory.get().makeFunction(WordUtils.class, "initials")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "WordUtils.swapCase", ReflectionFunctionFactory.get().makeFunction(WordUtils.class, "swapCase")); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "WordUtils.uncapitalize", ReflectionFunctionFactory.get().makeFunction(WordUtils.class, "uncapitalize")); try { - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.toLowerCase", - ReflectionFunctionFactory.get().makeFunction(String.class.getMethod("toLowerCase"))); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.toUpperCase", - ReflectionFunctionFactory.get().makeFunction(String.class.getMethod("toUpperCase"))); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.toLowerCase", ReflectionFunctionFactory.get().makeFunction(String.class.getMethod("toLowerCase"))); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "String.toUpperCase", ReflectionFunctionFactory.get().makeFunction(String.class.getMethod("toUpperCase"))); } catch (NoSuchMethodException e) { throw new RuntimeException(e); } @@ -192,12 +149,13 @@ public static void enablingFunctions() { log.error("", e); } - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "LevenshteinDistance",new StringDistanceFunctionFactory<>(new LevenshteinDistance()) ); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "CosineDistance",new StringDistanceFunctionFactory<>(new CosineDistance()) ); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "JaccardDistance",new StringDistanceFunctionFactory<>(new JaccardDistance()) ); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "JaroWinklerDistance",new StringDistanceFunctionFactory<>(new JaroWinklerDistance()) ); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "LongestCommonSubsequenceDistance",new StringDistanceFunctionFactory<>(new LongestCommonSubsequenceDistance()) ); - FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "HammingDistance",new StringDistanceFunctionFactory<>(new HammingDistance()) ); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "LevenshteinDistance", new SimilarityScoreFunctionFactory<>(new LevenshteinDistance())); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "CosineDistance", new SimilarityScoreFunctionFactory<>(new CosineDistance())); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "JaccardDistance", new SimilarityScoreFunctionFactory<>(new JaccardDistance())); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "JaroWinklerDistance", new SimilarityScoreFunctionFactory<>(new JaroWinklerDistance())); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "LongestCommonSubsequenceDistance", new SimilarityScoreFunctionFactory<>(new LongestCommonSubsequenceDistance())); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "HammingDistance", new SimilarityScoreFunctionFactory<>(new HammingDistance())); + FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "QGramDistance", new StringDistanceFunctionFactory(new QGram())); log.trace("Enabling function `serial`"); FunctionRegistry.get().put(Triplifier.FACADE_X_CONST_NAMESPACE_IRI + "serial", Serial.class); diff --git a/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/FunctionsUtils.java b/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/FunctionsUtils.java new file mode 100644 index 00000000..c4233f6f --- /dev/null +++ b/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/FunctionsUtils.java @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2023 SPARQL Anything Contributors @ http://github.com/sparql-anything + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.sparqlanything.engine.functions; + +import org.apache.jena.sparql.expr.ExprEvalException; +import org.apache.jena.sparql.expr.NodeValue; + +public abstract class FunctionsUtils { + + + public static String nodeValueAsString(NodeValue nodeValue) { + if (nodeValue.isLiteral()) { + return nodeValue.getString(); + } else if (nodeValue.isIRI()) { + return nodeValue.asNode().getURI(); + } + + throw new ExprEvalException("Argument must be literal or IRI"); + } + +} diff --git a/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/SimilarityScoreFunctionFactory.java b/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/SimilarityScoreFunctionFactory.java new file mode 100644 index 00000000..eb18e071 --- /dev/null +++ b/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/SimilarityScoreFunctionFactory.java @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023 SPARQL Anything Contributors @ http://github.com/sparql-anything + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.sparqlanything.engine.functions; + +import org.apache.commons.text.similarity.SimilarityScore; +import org.apache.jena.sparql.expr.NodeValue; +import org.apache.jena.sparql.function.Function; +import org.apache.jena.sparql.function.FunctionBase2; +import org.apache.jena.sparql.function.FunctionFactory; + +public class SimilarityScoreFunctionFactory implements FunctionFactory { + + private final SimilarityScore similarityScore; + + public SimilarityScoreFunctionFactory(SimilarityScore similarityScore){ + super(); + this.similarityScore = similarityScore; + } + + + @Override + public Function create(String s) { + return new FunctionBase2() { + @Override + public NodeValue exec(NodeValue nodeValue, NodeValue nodeValue1) { + T result = similarityScore.apply(FunctionsUtils.nodeValueAsString(nodeValue),FunctionsUtils.nodeValueAsString(nodeValue1)); + if(result instanceof Integer){ + return NodeValue.makeInteger((Integer)result); + } else if(result instanceof Double){ + return NodeValue.makeDouble((Double)result); + } + return NodeValue.nvNaN; + } + }; + } +} diff --git a/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/StringDistanceFunctionFactory.java b/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/StringDistanceFunctionFactory.java index 67fac61e..851ab1d5 100644 --- a/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/StringDistanceFunctionFactory.java +++ b/sparql-anything-engine/src/main/java/io/github/sparqlanything/engine/functions/StringDistanceFunctionFactory.java @@ -16,28 +16,19 @@ package io.github.sparqlanything.engine.functions; -import org.apache.commons.text.similarity.SimilarityScore; -import org.apache.jena.sparql.expr.ExprEvalException; +import info.debatty.java.stringsimilarity.interfaces.StringDistance; import org.apache.jena.sparql.expr.NodeValue; import org.apache.jena.sparql.function.Function; import org.apache.jena.sparql.function.FunctionBase2; import org.apache.jena.sparql.function.FunctionFactory; -public class StringDistanceFunctionFactory implements FunctionFactory { +public class StringDistanceFunctionFactory implements FunctionFactory { - private final SimilarityScore similarityScore; + private final StringDistance similarityScore; - private static String nodeValueAsString(NodeValue nodeValue) { - if (nodeValue.isLiteral()) { - return nodeValue.toString(); - } else if (nodeValue.isIRI()) { - return nodeValue.asNode().getURI(); - } - throw new ExprEvalException("Argument must be literal or IRI"); - } - - public StringDistanceFunctionFactory(SimilarityScore similarityScore){ + public StringDistanceFunctionFactory(StringDistance similarityScore){ + super(); this.similarityScore = similarityScore; } @@ -47,13 +38,7 @@ public Function create(String s) { return new FunctionBase2() { @Override public NodeValue exec(NodeValue nodeValue, NodeValue nodeValue1) { - T result = similarityScore.apply(nodeValueAsString(nodeValue),nodeValueAsString(nodeValue1)); - if(result instanceof Integer){ - return NodeValue.makeInteger((Integer)result); - } else if(result instanceof Double){ - return NodeValue.makeDouble((Double)result); - } - return NodeValue.nvNaN; + return NodeValue.makeDouble(similarityScore.distance(FunctionsUtils.nodeValueAsString(nodeValue),FunctionsUtils.nodeValueAsString(nodeValue1))); } }; } diff --git a/sparql-anything-engine/src/test/java/io/github/sparqlanything/engine/test/FunctionsTest.java b/sparql-anything-engine/src/test/java/io/github/sparqlanything/engine/test/FunctionsTest.java index 0e3b2d8b..25f3d507 100644 --- a/sparql-anything-engine/src/test/java/io/github/sparqlanything/engine/test/FunctionsTest.java +++ b/sparql-anything-engine/src/test/java/io/github/sparqlanything/engine/test/FunctionsTest.java @@ -65,6 +65,15 @@ public void levenshteinDistance() { Assert.assertEquals(2, dist); } + @Test + public void qgramDistance() { + String q = "PREFIX fx: SELECT ?result WHERE { BIND (fx:QGramDistance(\"ABCD\", \"ABCE\") AS ?result) } "; + ResultSet result = execute(q); + Assert.assertTrue(result.hasNext()); + double dist = result.next().get("result").asLiteral().getDouble(); + Assert.assertEquals(2.0, dist, 0.0); + } + @Test public void levenshteinDistanceURI() { String q = "PREFIX fx: SELECT ?result WHERE { BIND (fx:LevenshteinDistance(, ) AS ?result) } "; @@ -89,7 +98,7 @@ public void jaccardDistance() { ResultSet result = execute(q); Assert.assertTrue(result.hasNext()); double dist = result.next().get("result").asLiteral().getDouble(); - Assert.assertEquals(0.4, dist, 0.01); + Assert.assertEquals(0.5, dist, 0.0); } @Test @@ -98,7 +107,7 @@ public void jaroWinklerDistance() { ResultSet result = execute(q); Assert.assertTrue(result.hasNext()); double dist = result.next().get("result").asLiteral().getDouble(); - Assert.assertEquals(0.24, dist, 0.01); + Assert.assertEquals(0.44, dist, 0.01); } @Test public void longestCommonSubsequenceDistance() { diff --git a/sparql-anything-it/src/test/java/io/github/sparqlanything/it/SandboxTest.java b/sparql-anything-it/src/test/java/io/github/sparqlanything/it/SandboxTest.java index 01de1b8c..b97fddea 100644 --- a/sparql-anything-it/src/test/java/io/github/sparqlanything/it/SandboxTest.java +++ b/sparql-anything-it/src/test/java/io/github/sparqlanything/it/SandboxTest.java @@ -16,6 +16,7 @@ package io.github.sparqlanything.it; +import info.debatty.java.stringsimilarity.QGram; import org.apache.jena.graph.Graph; import org.apache.jena.graph.NodeFactory; import org.apache.jena.graph.Triple; @@ -35,6 +36,13 @@ public class SandboxTest { + @Ignore + @Test + public void m(){ + QGram d = new QGram(); + System.out.println(d.distance("ABCD", "ABCE")); + } + @Ignore @Test public void model(){