diff --git a/.gitignore b/.gitignore index 3b10967..47e7904 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,5 @@ project/plugins/project/ .worksheet *.iml .idea + +scalastyle-output.xml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..9374038 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,10 @@ +language: scala +sudo: false +cache: + directories: + - $HOME/.m2 +scala: + - 2.11.11 +script: + - mvn scalastyle:check + - mvn test \ No newline at end of file diff --git a/README.md b/README.md index cf2804e..c2b847f 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # SANSA Inference Layer [![Maven Central](https://maven-badges.herokuapp.com/maven-central/net.sansa-stack/sansa-inference-parent_2.11/badge.svg)](https://maven-badges.herokuapp.com/maven-central/net.sansa-stack/sansa-inference-parent_2.11) -[![Build Status](https://ci.aksw.org/jenkins/job/SANSA%20Inference%20Layer/job/develop/badge/icon)](https://ci.aksw.org/jenkins/job/SANSA%20Inference%20Layer/job/develop/) +[![Build Status](https://travis-ci.com/SANSA-Stack/SANSA-Inference.svg?branch=develop)](https://travis-ci.com/SANSA-Stack/SANSA-Inference) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Twitter](https://img.shields.io/twitter/follow/SANSA_Stack.svg?style=social)](https://twitter.com/SANSA_Stack) @@ -17,14 +17,15 @@ - [Setup](#setup) - [Prerequisites](#prerequisites) - [From source](#from-source) - - [Using Maven pre-build artifacts](#) + - [Using Maven pre-build artifacts](#using-maven-pre-build-artifacts) - [Using SBT](#using-SBT) - [Usage](#usage) - [Example](#example) - - [Supported Reasoning Profiles](#) + - [Supported Reasoning Profiles](#supported-reasoning-profiles) - [RDFS](#rdfs) - [RDFS Simple](#rdfs-simple) - [OWL Horst](#owl-horst) + - [How to Contribute](#how-to-contribute) ## Structure @@ -216,3 +217,7 @@ OWL Horst is a fragment of OWL and was proposed by Herman ter Horst [1] defining [1] Herman J. ter Horst: *Completeness, decidability and complexity of entailment for RDF Schema and a semantic extension involving the OWL vocabulary.* J. Web Sem. 3(2-3): 79-115 (2005) + +## How to Contribute +We always welcome new contributors to the project! Please see [our contribution guide](http://sansa-stack.net/contributing-to-sansa/) for more details on how to get started contributing to SANSA. 
+ diff --git a/pom.xml b/pom.xml index 99a389d..7cd5a03 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 net.sansa-stack sansa-inference-parent_2.11 - 0.3.0 + 0.4.0 pom Inference API - Parent @@ -66,12 +66,12 @@ UTF-8 UTF-8 - 2.11.11 + 2.11.12 2.11 - 2.2.1 - 1.3.2 - 3.5.0 - 0.3.0 + 2.3.1 + 1.5.0 + 3.7.0 + 0.4.0 ${sansa.stack.version} ${sansa.stack.version} ${sansa.stack.version} @@ -81,7 +81,8 @@ 512m 512m AKSW - 5.1.3 + 5.1.5 + ${project.basedir}/scalastyle-config.xml @@ -93,8 +94,17 @@ ${project.groupId} - sansa-rdf-spark-core + sansa-rdf-spark_${scala.binary.version} ${sansa.rdf.version} + + + + net.jpountz.lz4 + lz4 + + ${project.groupId} @@ -148,6 +158,16 @@ spark-sql_${scala.binary.version} ${spark.version} + + org.apache.spark + spark-streaming_${scala.binary.version} + ${spark.version} + + + org.apache.spark + spark-streaming-kafka-0-10_${scala.binary.version} + ${spark.version} + @@ -177,6 +197,16 @@ jena-arq ${jena.version} + + org.apache.jena + jena-tdb + ${jena.version} + + + org.apache.jena + jena-cmds + ${jena.version} + @@ -202,24 +232,29 @@ - com.assembla.scala-incubator + org.scala-graph graph-core_${scala.binary.version} - 1.10.0 + 1.12.5 - com.assembla.scala-incubator + org.scala-graph graph-dot_${scala.binary.version} - 1.9.0 + 1.11.5 org.jgrapht jgrapht-core - 1.1.0 + 1.2.0 + + + org.jgrapht + jgrapht-io + 1.2.0 org.jgrapht jgrapht-ext - 1.1.0 + 1.2.0 org.gephi @@ -231,7 +266,7 @@ org.apache.calcite calcite-core - 1.13.0 + 1.16.0 @@ -250,13 +285,13 @@ org.specs2 specs2-core_${scala.binary.version} - 4.0.2 + 4.2.0 test org.specs2 specs2-junit_${scala.binary.version} - 4.0.2 + 4.2.0 test @@ -264,7 +299,7 @@ com.typesafe.scala-logging scala-logging_${scala.binary.version} - 3.7.2 + 3.9.0 @@ -278,7 +313,7 @@ com.chuusai shapeless_${scala.binary.version} - 2.3.2 + 2.3.3 @@ -292,7 +327,7 @@ com.typesafe config - 1.3.2 + 1.3.3 @@ -520,30 +555,33 @@ com.versioneye versioneye-maven-plugin - - - - - - - - - - - - - - - - - - - - - - - - + + + + org.scalastyle + scalastyle-maven-plugin + 1.0.0 + + false + true + true + false + ${project.basedir}/src/main/scala + ${project.basedir}/src/test/scala + + ${scalastyle.config.path} + ${project.basedir}/scalastyle-output.xml + UTF-8 + + + + + check + + + + + @@ -668,7 +706,8 @@ - ossrh + + release ossrh @@ -786,5 +825,18 @@ + + + + root-dir + + + ${project.basedir}/../../scalastyle-config.xml + + + + ${project.basedir}/../scalastyle-config.xml + + diff --git a/sansa-inference-common/pom.xml b/sansa-inference-common/pom.xml index cc789a9..60a4610 100644 --- a/sansa-inference-common/pom.xml +++ b/sansa-inference-common/pom.xml @@ -4,12 +4,12 @@ sansa-inference-parent_2.11 net.sansa-stack - 0.3.0 + 0.4.0 ../pom.xml net.sansa-stack sansa-inference-common_${scala.binary.version} - 0.3.0 + 0.4.0 Inference API - Common A set of common objects used in the Inference API @@ -31,21 +31,19 @@ org.apache.jena jena-tdb - 3.5.0 org.apache.jena jena-cmds - 3.5.0 - com.assembla.scala-incubator + org.scala-graph graph-core_${scala.binary.version} - com.assembla.scala-incubator + org.scala-graph graph-dot_${scala.binary.version} @@ -56,6 +54,10 @@ org.jgrapht jgrapht-ext + + org.jgrapht + jgrapht-io + org.gephi gephi-toolkit @@ -65,6 +67,7 @@ google-collections + compile @@ -84,6 +87,11 @@ 3.5.0 + + + com.github.scopt + scopt_${scala.binary.version} + diff --git a/sansa-inference-common/src/main/resources/log4j.properties b/sansa-inference-common/src/main/resources/log4j.properties index 0caae7a..dae125a 100644 --- 
a/sansa-inference-common/src/main/resources/log4j.properties +++ b/sansa-inference-common/src/main/resources/log4j.properties @@ -1,5 +1,5 @@ # Root logger option -log4j.rootLogger=INFO, stdout +log4j.rootLogger=DEBUG, stdout # Direct log messages to a log file log4j.appender.file=org.apache.log4j.RollingFileAppender @@ -20,3 +20,5 @@ log4j.logger.akka.remote.Remoting=ERROR log4j.logger.org.apache.hadoop=ERROR log4j.logger.org.apache.calcite=ERROR + +log4j.logger.scalax.collection.connectivity.GraphComponents=OFF diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/JenaOps.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/JenaOps.scala index f7b5b47..ca33a96 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/JenaOps.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/JenaOps.scala @@ -1,10 +1,10 @@ package net.sansa_stack.inference.data +import scala.collection.JavaConverters._ + import org.apache.jena.datatypes.{BaseDatatype, RDFDatatype, TypeMapper} import org.apache.jena.graph.{Graph => JenaGraph, Node => JenaNode, Triple => JenaTriple, _} -import org.apache.jena.rdf.model.{Literal => JenaLiteral, Seq => _} - -import scala.collection.JavaConverters._ +import org.apache.jena.rdf.model.{Seq => _} class JenaOps extends RDFOps[Jena] { @@ -33,10 +33,12 @@ class JenaOps extends RDFOps[Jena] { val s = t.getSubject val p = t.getPredicate val o = t.getObject - if (p.isInstanceOf[Jena#URI]) - (s, p.asInstanceOf[Jena#URI], o) - else - throw new RuntimeException("fromTriple: predicate " + p.toString + " must be a URI") + p match { + case uri: Node_URI => + (s, uri, o) + case _ => + throw new RuntimeException("fromTriple: predicate " + p.toString + " must be a URI") + } } // node @@ -52,10 +54,11 @@ class JenaOps extends RDFOps[Jena] { def makeUri(iriStr: String): Jena#URI = { NodeFactory.createURI(iriStr).asInstanceOf[Node_URI] } def fromUri(node: Jena#URI): String = - if (node.isURI) + if (node.isURI) { node.getURI - else + } else { throw new RuntimeException("fromUri: " + node.toString() + " must be a URI") + } // bnode @@ -67,17 +70,18 @@ class JenaOps extends RDFOps[Jena] { } def fromBNode(bn: Jena#BNode): String = - if (bn.isBlank) + if (bn.isBlank) { bn.getBlankNodeId.getLabelString - else + } else { throw new RuntimeException("fromBNode: " + bn.toString + " must be a BNode") + } // literal // TODO the javadoc doesn't say if this is thread safe lazy val mapper = TypeMapper.getInstance - def jenaDatatype(datatype: Jena#URI) = { + private def jenaDatatype(datatype: Jena#URI) = { val iriString = fromUri(datatype) val typ = mapper.getTypeByName(iriString) if (typ == null) { @@ -94,10 +98,11 @@ class JenaOps extends RDFOps[Jena] { val __rdfLangStringURI: Jena#URI = makeUri("http://www.w3.org/1999/02/22-rdf-syntax-ns#langString") def makeLiteral(lexicalForm: String, datatype: Jena#URI): Jena#Literal = - if (datatype == __xsdStringURI) + if (datatype == __xsdStringURI) { NodeFactory.createLiteral(lexicalForm, null, null).asInstanceOf[Node_Literal] - else + } else { NodeFactory.createLiteral(lexicalForm, null, jenaDatatype(datatype)).asInstanceOf[Node_Literal] + } def makeLangTaggedLiteral(lexicalForm: String, lang: Jena#Lang): Jena#Literal = NodeFactory.createLiteral(lexicalForm, fromLang(lang), null).asInstanceOf[Node_Literal] @@ -105,9 +110,9 @@ class JenaOps extends RDFOps[Jena] { // lang - def makeLang(langString: String) = langString + def makeLang(langString: String): String = 
langString - def fromLang(lang: Jena#Lang) = lang + def fromLang(lang: Jena#Lang): String = lang diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/RDF.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/RDF.scala index c520929..244fdb9 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/RDF.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/RDF.scala @@ -39,4 +39,4 @@ trait RDF { // types for the graph traversal API type NodeMatch type NodeAny <: NodeMatch -} \ No newline at end of file +} diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/RDFTuple.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/RDFTuple.scala index 4be2f51..e5db778 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/RDFTuple.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/RDFTuple.scala @@ -8,6 +8,7 @@ package net.sansa_stack.inference.data * @author Lorenz Buehmann */ case class RDFTuple(s: String, o: String) extends Product2[String, String] { - override def _1: String = s - override def _2: String = o - } \ No newline at end of file + override def _1: String = s + + override def _2: String = o +} diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/SimpleRDFOps.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/SimpleRDFOps.scala index 0db5882..2d78419 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/SimpleRDFOps.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/data/SimpleRDFOps.scala @@ -17,10 +17,12 @@ class SimpleRDFOps extends RDFOps[SimpleRDF] { val s = t.s val p = t.p val o = t.o - if (p.isInstanceOf[SimpleRDF#URI]) - (s, p.asInstanceOf[SimpleRDF#URI], o) - else - throw new RuntimeException("fromTriple: predicate " + p.toString + " must be a URI") + p match { + case uri: String => + (s, uri, o) + case _ => + throw new RuntimeException("fromTriple: predicate " + p.toString + " must be a URI") + } } // node diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/RuleDependencyGraphAnalyzer.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/RuleDependencyGraphAnalyzer.scala index 1a43d47..e1f6552 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/RuleDependencyGraphAnalyzer.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/RuleDependencyGraphAnalyzer.scala @@ -137,8 +137,8 @@ object RuleDependencyGraphAnalyzer extends Logging{ def main(args: Array[String]) { // we re-use the JENA API for parsing rules val filenames = List( -// "rules/rdfs-simple.rules" - "rules/owl_horst.rules" + "rules/rdfs-simple.rules" +// "rules/owl_horst.rules" // "rules/owl_rl.rules" ) @@ -154,7 +154,7 @@ object RuleDependencyGraphAnalyzer extends Logging{ // print each rule as graph rules.foreach { r => - val g = RuleUtils.asGraph(r).export(new File(graphDir, r.getName + ".graphml").toString) + RuleUtils.asGraph(r).export(new File(graphDir, r.getName + ".graphml").toString) } // generate graph diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/RuleDependencyGraphGenerator.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/RuleDependencyGraphGenerator.scala index 8f16335..ee4de84 100644 --- 
a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/RuleDependencyGraphGenerator.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/RuleDependencyGraphGenerator.scala @@ -5,6 +5,7 @@ import java.util.stream.Collectors import scala.collection import scala.collection.JavaConverters._ import scala.language.{existentials, implicitConversions} + import scalax.collection.GraphPredef._ import scalax.collection.GraphTraversal.Parameters import scalax.collection._ @@ -13,7 +14,6 @@ import scalax.collection.edge._ import scalax.collection.mutable.DefaultGraphImpl import scalax.collection.GraphPredef._ import scalax.collection.GraphEdge._ - import org.apache.jena.graph.{Node, NodeFactory} import org.apache.jena.reasoner.TriplePattern import org.apache.jena.reasoner.rulesys.Rule @@ -259,12 +259,12 @@ object RuleDependencyGraphGenerator extends Logging { pairsOfRules :+= (cycle.last, cycle(0)) // map to list of edges - val edges: Buffer[graph.EdgeT] = pairsOfRules.map(e => { + val edges: Buffer[graph.EdgeT] = pairsOfRules.flatMap(e => { val node1 = graph get e._1 val node2 = graph get e._2 node1.outgoing.filter(_.target == node2) - }).flatten + }) debug("Edges: " + edges.mkString(", ")) // map to edge labels, i.e. the predicates diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/minimizer/MinimizationRule.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/minimizer/MinimizationRule.scala index 8b15776..362a8df 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/minimizer/MinimizationRule.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/minimizer/MinimizationRule.scala @@ -18,4 +18,4 @@ abstract class MinimizationRule extends Logging { def apply(graph: RuleDependencyGraph): RuleDependencyGraph -} \ No newline at end of file +} diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/minimizer/RuleDependencyGraphMinimizer.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/minimizer/RuleDependencyGraphMinimizer.scala index c93103a..4d50e6d 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/minimizer/RuleDependencyGraphMinimizer.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/minimizer/RuleDependencyGraphMinimizer.scala @@ -1,10 +1,12 @@ package net.sansa_stack.inference.rules.minimizer import scala.collection.mutable.{ArrayBuffer, Buffer} + import scalax.collection.Graph import scalax.collection.edge.LDiEdge import scala.collection.JavaConverters._ -import scalax.collection.GraphTraversal.Parameters +import scala.collection.mutable +import scalax.collection.GraphTraversal.Parameters import org.apache.jena.graph.{Node, NodeFactory} import org.apache.jena.reasoner.TriplePattern import org.apache.jena.reasoner.rulesys.Rule @@ -16,7 +18,6 @@ import net.sansa_stack.inference.rules.RuleDependencyGraphGenerator.{asString, d import net.sansa_stack.inference.utils.{GraphUtils, RuleUtils} import net.sansa_stack.inference.utils.graph.LabeledEdge import net.sansa_stack.inference.utils.RuleUtils._ - import scalax.collection.GraphTraversal.Parameters import scalax.collection._ import scalax.collection.edge.Implicits._ @@ -294,11 +295,13 @@ abstract class RuleDependencyGraphMinimizer extends MinimizationRuleExecutor { // debug(cycles.asScala.mkString(",")) // cycles that contain the current node - val cyclesWithNode: 
Buffer[Buffer[Rule]] = allCycles.asScala.filter(cycle => cycle.contains(node.value)).map(cycle => cycle.asScala) + val cyclesWithNode: mutable.Buffer[mutable.Buffer[Rule]] = allCycles.asScala + .filter(cycle => cycle.contains(node.value)) + .map(cycle => cycle.asScala) debug("Cycles: " + cyclesWithNode.map(c => c.map(r => r.getName)).mkString(",")) // cycles that use the same property - val cyclesWithNodeSameProp: Map[Node, scala.List[Buffer[graph.EdgeT]]] = cyclesWithNode.map(cycle => { + val cyclesWithNodeSameProp: Map[Node, scala.List[mutable.Buffer[graph.EdgeT]]] = cyclesWithNode.map(cycle => { debug("Cycle: " + cycle.map(r => r.getName).mkString(", ")) @@ -307,12 +310,12 @@ abstract class RuleDependencyGraphMinimizer extends MinimizationRuleExecutor { pairsOfRules :+= (cycle.last, cycle(0)) // map to list of edges - val edges: Buffer[graph.EdgeT] = pairsOfRules.map(e => { + val edges: mutable.Buffer[graph.EdgeT] = pairsOfRules.flatMap(e => { val node1 = graph get e._1 val node2 = graph get e._2 node1.outgoing.filter(_.target == node2) - }).flatten + }) debug("Edges: " + edges.mkString(", ")) // map to edge labels, i.e. the predicates @@ -325,9 +328,14 @@ abstract class RuleDependencyGraphMinimizer extends MinimizationRuleExecutor { if (samePred) Some(predicates(0), edges) else None }).filter(_.isDefined).map(_.get).groupBy(e => e._1).mapValues(e => e.map(x => x._2).toList) - var removedCycles: collection.mutable.Set[Buffer[graph.EdgeT]] = collection.mutable.Set() + var removedCycles: collection.mutable.Set[mutable.Buffer[graph.EdgeT]] = collection.mutable.Set() - val tmp: Map[Node, Map[Int, List[Buffer[graph.EdgeT]]]] = cyclesWithNodeSameProp.mapValues(value => value.map(cycle => (cycle.size, cycle)).groupBy(_._1).mapValues(e => e.map(x => x._2).toList)) + val tmp: Map[Node, Map[Int, List[mutable.Buffer[graph.EdgeT]]]] = + cyclesWithNodeSameProp + .mapValues(value => + value.map(cycle => (cycle.size, cycle)) + .groupBy(_._1) + .mapValues(e => e.map(x => x._2))) tmp.foreach(predicate2Cycles => { debug("predicate: " + predicate2Cycles._1) diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimpleCalciteConnection.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimpleCalciteConnection.scala index 3e246e0..acae963 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimpleCalciteConnection.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimpleCalciteConnection.scala @@ -8,7 +8,7 @@ import java.util.concurrent.Executor import org.apache.calcite.adapter.java.JavaTypeFactory import org.apache.calcite.config.CalciteConnectionConfig -import org.apache.calcite.jdbc.CalciteConnection +import org.apache.calcite.jdbc.{CalciteConnection, CalcitePrepare} import org.apache.calcite.linq4j.tree.Expression import org.apache.calcite.linq4j.{Enumerator, Queryable} import org.apache.calcite.schema.SchemaPlus @@ -150,4 +150,6 @@ class SimpleCalciteConnection extends CalciteConnection{ override def execute[T](expression: Expression, `type`: Type): T = null.asInstanceOf[T] override def executeQuery[T](queryable: Queryable[T]): Enumerator[T] = null + + override def createPrepareContext(): CalcitePrepare.Context = null } diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimplePlanGenerator.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimplePlanGenerator.scala index b35fbe8..75e1a26 100644 --- 
a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimplePlanGenerator.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimplePlanGenerator.scala @@ -1,30 +1,20 @@ package net.sansa_stack.inference.rules.plan -import java.io.PrintWriter -import java.util.Collections +import scala.collection.JavaConverters._ +import scala.util.Try -import com.google.common.collect.ImmutableList import org.apache.calcite.config.Lex +import org.apache.calcite.interpreter.{BindableConvention, Bindables} import org.apache.calcite.plan.{RelOptUtil, _} +import org.apache.calcite.rel.`type`.RelDataTypeSystem +import org.apache.calcite.rel.rules._ import org.apache.calcite.rel.{RelCollationTraitDef, RelNode} import org.apache.calcite.schema.SchemaPlus import org.apache.calcite.sql.parser.SqlParser import org.apache.calcite.tools._ -import collection.JavaConverters._ -import scala.util.Try - -import org.apache.calcite.rel.`type`.RelDataTypeSystem -import org.apache.calcite.rel.externalize.RelWriterImpl -import org.apache.calcite.rel.rules._ import org.apache.jena.reasoner.rulesys.Rule import net.sansa_stack.inference.utils.{Logging, RuleUtils} -import org.apache.calcite.adapter.enumerable.{EnumerableConvention, EnumerableRules} -import org.apache.calcite.interpreter.{BindableConvention, Bindables} -import org.apache.calcite.plan.RelOptPlanner.CannotPlanException -import org.apache.calcite.plan.hep.{HepMatchOrder, HepPlanner, HepProgramBuilder} -import org.apache.calcite.plan.volcano.VolcanoPlanner -import org.apache.calcite.sql2rel.{RelDecorrelator, SqlToRelConverter} /** * @author Lorenz Buehmann @@ -37,7 +27,7 @@ class SimplePlanGenerator(schema: SchemaPlus) extends Logging { BindableConvention.INSTANCE.getTraitDef ) - val optRuleSet = RuleSets.ofList( + val optRuleSet: RuleSet = RuleSets.ofList( FilterJoinRule.FILTER_ON_JOIN,// push a filter into a join FilterJoinRule.JOIN,// push filter into the children of a join ProjectJoinTransposeRule.INSTANCE// push a projection to the children of a join @@ -68,13 +58,13 @@ class SimplePlanGenerator(schema: SchemaPlus) extends Logging { // // Context provides a way to store data within the planner session that can be accessed in planner rules. // .context(Contexts.EMPTY_CONTEXT) // // Rule sets to use in transformation phases. Each transformation phase can use a different set of rules. 
-//// .ruleSets(optRuleSet) +// // .ruleSets(optRuleSet) // .ruleSets(RuleSets.ofList(Bindables.BINDABLE_TABLE_SCAN_RULE, Bindables.BINDABLE_PROJECT_RULE, Bindables.BINDABLE_JOIN_RULE, Bindables.BINDABLE_FILTER_RULE, FilterJoinRule.FILTER_ON_JOIN)) // .programs(Programs.ofRules(Bindables.BINDABLE_TABLE_SCAN_RULE, Bindables.BINDABLE_PROJECT_RULE, Bindables.BINDABLE_JOIN_RULE, Bindables.BINDABLE_FILTER_RULE, FilterJoinRule.FILTER_ON_JOIN)) // // // Custom cost factory to use during optimization // .costFactory(null) -//// .programs(program) +// // .programs(program) // .typeSystem(RelDataTypeSystem.DEFAULT) // .build() diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimpleRelBuilder.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimpleRelBuilder.scala index fc3f415..4b24037 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimpleRelBuilder.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/SimpleRelBuilder.scala @@ -1,10 +1,14 @@ package net.sansa_stack.inference.rules.plan +import java.util.Properties + +import org.apache.calcite.config.{CalciteConnectionConfig, CalciteConnectionConfigImpl, CalciteConnectionProperty} import org.apache.calcite.jdbc.CalciteSchema import org.apache.calcite.plan.{Context, RelOptCluster, RelOptPlanner, RelOptSchema} import org.apache.calcite.prepare.CalciteCatalogReader import org.apache.calcite.rex.RexBuilder import org.apache.calcite.schema.SchemaPlus +import org.apache.calcite.sql.parser.SqlParser import org.apache.calcite.tools.Frameworks.PlannerAction import org.apache.calcite.tools.{FrameworkConfig, Frameworks, RelBuilder} @@ -56,11 +60,18 @@ object SimpleRelBuilder { val calciteSchema = CalciteSchema.from(config.getDefaultSchema) val relOptSchema = new CalciteCatalogReader( calciteSchema, - config.getParserConfig.caseSensitive(), defaultRelOptSchema.getSchemaPaths.get(0), - typeFactory) + typeFactory, + connectionConfig(config.getParserConfig)) new SimpleRelBuilder(config.getContext, cluster, relOptSchema) } + def connectionConfig(parserConfig : SqlParser.Config): CalciteConnectionConfig = { + val prop = new Properties() + prop.setProperty(CalciteConnectionProperty.CASE_SENSITIVE.camelName, + String.valueOf(parserConfig.caseSensitive)) + new CalciteConnectionConfigImpl(prop) + } + } diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/TriplesTableFactory.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/TriplesTableFactory.scala index 330e0bd..663f504 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/TriplesTableFactory.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/rules/plan/TriplesTableFactory.scala @@ -5,11 +5,13 @@ import java.util import scala.collection.JavaConverters._ import org.apache.calcite.DataContext +import org.apache.calcite.config.CalciteConnectionConfig import org.apache.calcite.linq4j.{Enumerable, Linq4j} import org.apache.calcite.rel.`type`.{RelDataType, RelDataTypeFactory, RelProtoDataType} import org.apache.calcite.rex.RexNode import org.apache.calcite.schema.Schema.TableType import org.apache.calcite.schema._ +import org.apache.calcite.sql.{SqlCall, SqlNode} import org.apache.calcite.sql.`type`.SqlTypeName /** @@ -53,6 +55,10 @@ class TriplesTableFactory extends TableFactory[Table] { override def getRowType(typeFactory: RelDataTypeFactory): RelDataType 
= protoRowType.apply(typeFactory) + override def isRolledUp(s: String): Boolean = false + override def rolledUpColumnValidInsideAgg(s: String, sqlCall: SqlCall, sqlNode: SqlNode, + calciteConnectionConfig: CalciteConnectionConfig): Boolean = false } + } diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/GraphUtils.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/GraphUtils.scala index 0d398db..fbea2fb 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/GraphUtils.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/GraphUtils.scala @@ -1,14 +1,14 @@ package net.sansa_stack.inference.utils import java.io.{ByteArrayOutputStream, File, FileOutputStream, FileWriter} - -import scalax.collection.edge.LDiEdge +import java.util import com.itextpdf.text.PageSize import org.apache.jena.graph.Node import org.apache.jena.reasoner.TriplePattern import org.apache.jena.reasoner.rulesys.Rule import org.apache.jena.shared.PrefixMapping +import org.apache.jena.sparql.util.FmtUtils import org.gephi.graph.api.GraphController import org.gephi.io.exporter.api.ExportController import org.gephi.io.exporter.preview.PDFExporter @@ -19,13 +19,15 @@ import org.gephi.layout.plugin.force.yifanHu.YifanHuLayout import org.gephi.preview.api.{Item, PreviewController, PreviewProperty} import org.gephi.preview.types.EdgeColor import org.gephi.project.api.ProjectController -import org.jgrapht.DirectedGraph +import org.jgrapht.Graph import org.jgrapht.alg.isomorphism.VF2GraphIsomorphismInspector -import org.jgrapht.ext._ -import org.jgrapht.graph._ +import org.jgrapht.graph.{DefaultDirectedGraph, DirectedPseudograph} +import org.jgrapht.io.GraphMLExporter.AttributeCategory +import org.jgrapht.io._ import org.openide.util.Lookup +import scalax.collection.edge.LDiEdge -import net.sansa_stack.inference.utils.graph.{EdgeEquivalenceComparator, LabeledEdge, NodeEquivalenceComparator}; +import net.sansa_stack.inference.utils.graph.{EdgeEquivalenceComparator, LabeledEdge, NodeEquivalenceComparator} /** * @author Lorenz Buehmann @@ -52,8 +54,8 @@ object GraphUtils { * @param graph the 'Graph for Scala' graph * @return the JGraphT graph */ - def asJGraphtRuleGraph(graph: scalax.collection.mutable.Graph[Node, LDiEdge]): DirectedGraph[Node, LabeledEdge[Node, String]] = { - val g: DirectedGraph[Node, LabeledEdge[Node, String]] = new DefaultDirectedGraph[Node, LabeledEdge[Node, String]](classOf[LabeledEdge[Node, String]]) + def asJGraphtRuleGraph(graph: scalax.collection.mutable.Graph[Node, LDiEdge]): Graph[Node, LabeledEdge[Node, String]] = { + val g: Graph[Node, LabeledEdge[Node, String]] = new DefaultDirectedGraph[Node, LabeledEdge[Node, String]](classOf[LabeledEdge[Node, String]]) val edges = graph.edges.toList @@ -89,7 +91,8 @@ object GraphUtils { * @param graph the 'Graph for Scala' graph * @return the JGraphT graph */ - def asJGraphtRuleSetGraph(graph: scalax.collection.mutable.Graph[Rule, LDiEdge]): DirectedGraph[Rule, LabeledEdge[Rule, TriplePattern]] = { + def asJGraphtRuleSetGraph(graph: scalax.collection.mutable.Graph[Rule, LDiEdge], + showInFlowDirection: Boolean = false): Graph[Rule, LabeledEdge[Rule, TriplePattern]] = { val g = new DefaultDirectedGraph[Rule, LabeledEdge[Rule, TriplePattern]](classOf[LabeledEdge[Rule, TriplePattern]]) val edges = graph.edges.toList @@ -102,7 +105,11 @@ object GraphUtils { val label = e.label.asInstanceOf[TriplePattern] - g.addEdge(s, t, LabeledEdge[Rule, TriplePattern](s, t, 
label)) + if (showInFlowDirection) { + g.addEdge(t, s, LabeledEdge[Rule, TriplePattern](t, s, label)) + } else { + g.addEdge(s, t, LabeledEdge[Rule, TriplePattern](s, t, label)) + } } @@ -117,9 +124,10 @@ object GraphUtils { * * @param filename the target file */ - def export(filename: String, showInFlowDirection: Boolean = false): Unit = { + def export(filename: String, showInFlowDirection: Boolean = false, + prefixMapping: PrefixMapping = PrefixMapping.Standard): Unit = { - val g: DirectedGraph[Rule, LabeledEdge[Rule, TriplePattern]] = asJGraphtRuleSetGraph(graph) + val g: Graph[Rule, LabeledEdge[Rule, TriplePattern]] = asJGraphtRuleSetGraph(graph, showInFlowDirection) // In order to be able to export edge and node labels and IDs, // we must implement providers for them @@ -138,17 +146,38 @@ object GraphUtils { } val edgeLabelProvider = new ComponentNameProvider[LabeledEdge[Rule, TriplePattern]]() { - override def getName(e: LabeledEdge[Rule, TriplePattern]): String = e.label.toString + override def getName(e: LabeledEdge[Rule, TriplePattern]): String = { + val p = e.label.getPredicate + // omit if predicate is a variable + if(p.isVariable) { + "" + } else { + FmtUtils.stringForNode(e.label.getPredicate, prefixMapping) + } + } } -// val exporter = new GraphMLExporter[String,LabeledEdge]( + import org.jgrapht.io.DefaultAttribute + val ruleDescriptionProvider = new ComponentAttributeProvider[Rule]() { + override def getComponentAttributes(r: Rule): util.Map[String, Attribute] = { + val map = new util.HashMap[String, Attribute]() + map.put("rule", DefaultAttribute.createAttribute(r.toString)) + map + } + } + + // val exporter = new GraphMLExporter[String,LabeledEdge]( // vertexIDProvider, vertexNameProvider, edgeIDProvider,edgeLabelProvider) val exporter = new GraphMLExporter[Rule, LabeledEdge[Rule, TriplePattern]]( new IntegerComponentNameProvider[Rule], vertexNameProvider, + ruleDescriptionProvider, new IntegerComponentNameProvider[LabeledEdge[Rule, TriplePattern]], - edgeLabelProvider) + edgeLabelProvider, + null) + + exporter.registerAttribute("rule", AttributeCategory.NODE, AttributeType.STRING) val fw = new FileWriter(filename) @@ -159,12 +188,12 @@ object GraphUtils { // Gephi // Init a project - and therefore a workspace - val pc = Lookup.getDefault().lookup(classOf[ProjectController]) + val pc = Lookup.getDefault.lookup(classOf[ProjectController]) pc.newProject() - val workspace = pc.getCurrentWorkspace() + val workspace = pc.getCurrentWorkspace // Get controllers and models - val importController = Lookup.getDefault().lookup(classOf[ImportController]) + val importController = Lookup.getDefault.lookup(classOf[ImportController]) // export as GraphML val tmpFilename = "/tmp/temp-graph.graphml" @@ -173,8 +202,8 @@ object GraphUtils { // Import file val file = new File(tmpFilename) val container = importController.importFile(file) - container.getLoader().setEdgeDefault(EdgeDirectionDefault.DIRECTED) // Force DIRECTED - container.getLoader().setAllowAutoNode(false) // Don't create missing nodes + container.getLoader.setEdgeDefault(EdgeDirectionDefault.DIRECTED) // Force DIRECTED + container.getLoader.setAllowAutoNode(false) // Don't create missing nodes // Append imported data to GraphAPI importController.process(container, new DefaultProcessor(), workspace) @@ -184,7 +213,7 @@ object GraphUtils { // See if graph is well imported - val graphModel = Lookup.getDefault().lookup(classOf[GraphController]).getGraphModel + val graphModel = 
Lookup.getDefault.lookup(classOf[GraphController]).getGraphModel val g = graphModel.getDirectedGraph() // Run YifanHuLayout for 100 passes - The layout always takes the current visible view @@ -194,23 +223,23 @@ object GraphUtils { layout.setOptimalDistance(200f) layout.initAlgo() - for (i <- 0 to 100 if layout.canAlgo()) { + for (i <- 0 to 100 if layout.canAlgo) { layout.goAlgo() } layout.endAlgo() - val model = Lookup.getDefault().lookup(classOf[PreviewController]).getModel() - model.getProperties().putValue(PreviewProperty.SHOW_NODE_LABELS, true) - model.getProperties().putValue(PreviewProperty.SHOW_EDGE_LABELS, true) - model.getProperties().putValue(PreviewProperty.EDGE_CURVED, false) - model.getProperties().putValue(PreviewProperty.EDGE_COLOR, new EdgeColor(java.awt.Color.GRAY)) - model.getProperties().putValue(PreviewProperty.EDGE_THICKNESS, 0.1f) - model.getProperties().putValue(PreviewProperty.NODE_LABEL_FONT, model.getProperties().getFontValue(PreviewProperty.NODE_LABEL_FONT).deriveFont(8)) + val model = Lookup.getDefault.lookup(classOf[PreviewController]).getModel() + model.getProperties.putValue(PreviewProperty.SHOW_NODE_LABELS, true) + model.getProperties.putValue(PreviewProperty.SHOW_EDGE_LABELS, true) + model.getProperties.putValue(PreviewProperty.EDGE_CURVED, false) + model.getProperties.putValue(PreviewProperty.EDGE_COLOR, new EdgeColor(java.awt.Color.GRAY)) + model.getProperties.putValue(PreviewProperty.EDGE_THICKNESS, 0.1f) + model.getProperties.putValue(PreviewProperty.NODE_LABEL_FONT, model.getProperties.getFontValue(PreviewProperty.NODE_LABEL_FONT).deriveFont(8)) model.getProperties.putValue(Item.NODE_LABEL, "Vertex Label") // Export full graph - val ec = Lookup.getDefault().lookup(classOf[ExportController]) + val ec = Lookup.getDefault.lookup(classOf[ExportController]) // ec.exportFile(new File("io_gexf.gexf")); // PDF Exporter config and export to Byte array @@ -219,7 +248,7 @@ object GraphUtils { pdfExporter.setWorkspace(workspace) val baos = new ByteArrayOutputStream() ec.exportStream(baos, pdfExporter) - new FileOutputStream(filename + ".pdf").write(baos.toByteArray()) + new FileOutputStream(filename + ".pdf").write(baos.toByteArray) } } @@ -302,7 +331,7 @@ object GraphUtils { */ def export(filename: String): Unit = { - val g: DirectedGraph[Node, LabeledEdge[Node, Node]] = new DirectedPseudograph[Node, LabeledEdge[Node, Node]](classOf[LabeledEdge[Node, Node]]) + val g: Graph[Node, LabeledEdge[Node, Node]] = new DirectedPseudograph[Node, LabeledEdge[Node, Node]](classOf[LabeledEdge[Node, Node]]) val edges = graph.edges.toList @@ -312,7 +341,7 @@ object GraphUtils { val label = e.label.asInstanceOf[Node] g.addVertex(s) g.addVertex(t) - g.addEdge(s, t, new LabeledEdge(s, t, label)) + g.addEdge(s, t, LabeledEdge(s, t, label)) } // In order to be able to export edge and node labels and IDs, diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/JenaTripleToNTripleString.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/JenaTripleToNTripleString.scala index 46d8df6..b5dee64 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/JenaTripleToNTripleString.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/JenaTripleToNTripleString.scala @@ -9,7 +9,7 @@ import org.apache.jena.graph.Triple * @author Lorenz Buehmann */ class JenaTripleToNTripleString - extends ((Triple) => String) + extends Function[Triple, String] with java.io.Serializable { override def 
apply(t: Triple): String = { val subStr = @@ -27,7 +27,7 @@ class JenaTripleToNTripleString } else { s"<${t.getObject}>" } - s"${subStr} <${t.getPredicate}> ${objStr} ." + s"$subStr <${t.getPredicate}> $objStr ." } } diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/Logging.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/Logging.scala index 4b75d32..e3f4ecf 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/Logging.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/Logging.scala @@ -11,7 +11,7 @@ import scala.language.implicitConversions */ trait Logging { - @transient private var log_ : Logger = null + @transient private var log_ : Logger = _ // Method to get or create the logger for this object protected def log: Logger = { @@ -22,7 +22,7 @@ trait Logging { } // Method to get the logger name for this object - protected def logName = { + protected def logName: String = { // Ignore trailing $'s in the class names for Scala objects this.getClass.getName.stripSuffix("$") } diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/NTriplesStringToJenaTriple.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/NTriplesStringToJenaTriple.scala index f8a75d2..f46481d 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/NTriplesStringToJenaTriple.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/NTriplesStringToJenaTriple.scala @@ -11,7 +11,7 @@ import org.apache.jena.riot.{Lang, RDFDataMgr} * @author Lorenz Buehmann */ class NTriplesStringToJenaTriple - extends Function1[String, Triple] + extends Function[String, Triple] with java.io.Serializable { override def apply(s: String): Triple = { RDFDataMgr.createIteratorTriples(new ByteArrayInputStream(s.getBytes), Lang.NTRIPLES, null).next() diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/RDFTripleToNTripleString.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/RDFTripleToNTripleString.scala index 634b1a8..33e1d1b 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/RDFTripleToNTripleString.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/RDFTripleToNTripleString.scala @@ -8,7 +8,7 @@ import net.sansa_stack.inference.data.RDFTriple * @author Lorenz Buehmann */ class RDFTripleToNTripleString - extends Function1[RDFTriple, String] + extends Function[RDFTriple, String] with java.io.Serializable { override def apply(t: RDFTriple): String = { val objStr = @@ -17,6 +17,6 @@ class RDFTripleToNTripleString } else { t.o } - s"<${t.s}> <${t.p}> ${objStr} ." + s"<${t.s}> <${t.p}> $objStr ." 
} } diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/RuleUtils.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/RuleUtils.scala index 5c0127b..fea7ba2 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/RuleUtils.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/RuleUtils.scala @@ -226,7 +226,7 @@ object RuleUtils { // get the path in body graph val s = (bodyGraph get source).withSubgraph(edges = e => property == null || e.label.equals(property)) - val t = (bodyGraph get target) + val t = bodyGraph get target val path = s pathTo t @@ -299,7 +299,8 @@ object RuleUtils { // predicates that are contained in body and head val intersection = bodyPredicates.intersect(headPredicates) - ruleType match { + // 1. check whether there is an overlap between body and head predicates (might not work) + val cyclic = ruleType match { case TERMINOLOGICAL => // check if there is at least one predicate that occurs in body and head val bodyPredicates = rule.getBody @@ -331,6 +332,8 @@ object RuleUtils { } + // 2. use JGraphT instead and compute cycles + // we generate a graph for the rule (we use a JGraphT graph which provides better cycle detection) val g = GraphUtils.asJGraphtRuleGraph(asGraph(rule)) @@ -356,9 +359,7 @@ object RuleUtils { * @param filename the file * @return a set of rules */ - def load(filename: String): Seq[Rule] = { - Rule.parseRules(org.apache.jena.reasoner.rulesys.Util.loadRuleParserFromResourceFile(filename)).asScala.toSeq - } + def load(filename: String): Seq[Rule] = Rule.parseRules(org.apache.jena.reasoner.rulesys.Util.loadRuleParserFromResourceFile(filename)).asScala /** * Returns a rule by the given name from a set of rules. @@ -423,14 +424,14 @@ object RuleUtils { } /** - * Returns `true` if `rule1 has the same body as `rule2`, otherwise `false` . + * Returns `true` if `rule1` has the same body as `rule2`, otherwise `false` . */ def sameBody(rule1: Rule, rule2: Rule): Boolean = { GraphUtils.areIsomorphic(graphOfBody(rule1), graphOfBody(rule2)) } /** - * Returns `true` if `rule1 has the same head as `rule2`, otherwise `false`. + * Returns `true` if `rule1` has the same head as `rule2`, otherwise `false`. 
*/ def sameHead(rule1: Rule, rule2: Rule): Boolean = { GraphUtils.areIsomorphic(graphOfHead(rule1), graphOfHead(rule2)) diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/TriplePatternOrdering.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/TriplePatternOrdering.scala index a3f023f..39b91bb 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/TriplePatternOrdering.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/TriplePatternOrdering.scala @@ -9,7 +9,7 @@ import org.apache.jena.sparql.util.NodeComparator * @author Lorenz Buehmann */ class TriplePatternOrdering extends Ordering[TriplePattern]{ - implicit val comp = new NodeComparator + implicit val comp: NodeComparator = new NodeComparator override def compare(x: TriplePattern, y: TriplePattern): Int = { Ordering.by{t: TriplePattern => (t.getSubject, t.getPredicate, t.getObject)}.compare(x, y) diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/TripleUtils.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/TripleUtils.scala index cf74b36..a131789 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/TripleUtils.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/TripleUtils.scala @@ -15,14 +15,14 @@ import org.apache.jena.vocabulary.RDFS._ object TripleUtils { // set of properties that indicate terminological triples - val properties = List( + val properties: List[Node] = List( subClassOf, equivalentClass, disjointWith, intersectionOf, unionOf, complementOf, someValuesFrom, allValuesFrom, hasValue, maxCardinality, minCardinality, cardinality, subPropertyOf, equivalentProperty, propertyDisjointWith, domain, range, inverseOf).map(t => t.asNode()) // set of types that indicate terminological triples - val types = Set( + val types: Set[Node] = Set( ObjectProperty, DatatypeProperty, FunctionalProperty, InverseFunctionalProperty, SymmetricProperty, AsymmetricProperty, @@ -115,7 +115,7 @@ object TripleUtils { * @return all var nodes of the triple pattern */ def vars(): Seq[Node] = { - nodes.filter(_.isVariable) + nodes().filter(_.isVariable) } } diff --git a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/graph/LabeledEdge.scala b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/graph/LabeledEdge.scala index 7cc0603..adf6925 100644 --- a/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/graph/LabeledEdge.scala +++ b/sansa-inference-common/src/main/scala/net/sansa_stack/inference/utils/graph/LabeledEdge.scala @@ -3,6 +3,11 @@ package net.sansa_stack.inference.utils.graph import org.jgrapht.graph.DefaultEdge /** + * A labeled edge that also keeps track of source and target nodes. 
+ * + * @param s source node + * @param t target node + * @param label the label * @author Lorenz Buehmann */ case class LabeledEdge[V, L](s: V, t: V, label: L) extends DefaultEdge {} diff --git a/sansa-inference-common/src/test/scala/net/sansa_stack/inference/common/DependencyGraphTest.scala b/sansa-inference-common/src/test/scala/net/sansa_stack/inference/common/DependencyGraphTest.scala index 48ddcc0..38964f7 100644 --- a/sansa-inference-common/src/test/scala/net/sansa_stack/inference/common/DependencyGraphTest.scala +++ b/sansa-inference-common/src/test/scala/net/sansa_stack/inference/common/DependencyGraphTest.scala @@ -1,41 +1,95 @@ package net.sansa_stack.inference.common +import java.nio.file.{Path, Paths} + import net.sansa_stack.inference.rules._ import net.sansa_stack.inference.rules.minimizer.DefaultRuleDependencyGraphMinimizer import net.sansa_stack.inference.utils.GraphUtils._ import net.sansa_stack.inference.utils.RuleUtils /** + * Computes a given set of rules and exports its rule dependency graph before and after minimization. + * + * * @author Lorenz Buehmann */ object DependencyGraphTest { + // the config object + case class Config(in: Path = null, + out: Path = null, + profile: String = "", + ruleNames: Seq[String] = Seq() + ) + + implicit val pathRead: scopt.Read[Path] = + scopt.Read.reads(Paths.get(_)) + + // the CLI parser + val parser = new scopt.OptionParser[Config]("DependencyGraphTest") { + + head("DependencyGraphTest", "0.1.0") + + opt[Path]('i', "input").required().valueName(""). + action((x, c) => c.copy(in = x)). + text("path to file containing the rules") + + opt[Path]('o', "out").required().valueName(""). + action((x, c) => c.copy(out = x)). + text("the output directory") + + opt[String]('p', "profile").required().valueName(""). + action((x, c) => c.copy(profile = x)). + text("the name of the set of rules to process - will be used for output files") + + opt[Seq[String]]("rules").optional().valueName(",,..."). + action((x, c) => { + c.copy(ruleNames = x) + }). 
+ text("list of rule names to process just a subset of the rules contained in the given input file") + } + def main(args: Array[String]): Unit = { - val path = "/tmp" + parser.parse(args, Config()) match { + case Some(config) => + run(config) + case None => + // scalastyle:off println + println(parser.usage) + // scalastyle:on println + } + } + + def run(config: Config): Unit = { + + // make output dirs + config.out.toFile.mkdirs() + + // load the rules + var rules = RuleUtils.load(config.in.toAbsolutePath.toString) + + // filter if necessary + if(config.ruleNames.nonEmpty) { + rules = rules.filter(r => config.ruleNames.contains(r.getName)) + } // val names = Seq("rdfp13a", "rdfp13b", "rdfp13c", "rdfs5", "rdfs7") // property rules - val names = Seq("rdfp13a", "rdfp13b", "rdfp13c", "rdfs5", "rdfs7", "rdfp3", "rdfp4") // property rules + some instance rules + val names = Seq("rdfp13a", "rdfp13b", "rdfp13c")// , "rdfs5", "rdfs7", "rdfp3", "rdfp4") // property rules + some instance rules // val names = Seq("rdfs5", "rdfs7", "rdfp3", "rdfp4") // property TC rule + some instance rules - // define the rules - val rules = RuleSets.OWL_HORST//.filter(r => names.contains(r.getName)) - val profile = ReasoningProfile.OWL_HORST -// val rules = RuleSets.RDFS_SIMPLE -// val profile = ReasoningProfile.RDFS_SIMPLE - val minimizer = new DefaultRuleDependencyGraphMinimizer() - // export graphs - rules.foreach(rule => RuleUtils.asGraph(rule).export(s"${path}/rule-${rule.getName}.graphml")) + // export graphs for each rule + rules.foreach(rule => RuleUtils.asGraph(rule).export(config.out.resolve(s"rule_${rule.getName}.graphml").toAbsolutePath.toString)) // generate the rule dependency graph - var dependencyGraph = RuleDependencyGraphGenerator.generate(rules) - dependencyGraph.export(s"${path}/rdg-${profile}.graphml") + var dependencyGraph = RuleDependencyGraphGenerator.generate(rules.toSet) + dependencyGraph.export(config.out.resolve(s"rdg_${config.profile}.graphml").toAbsolutePath.toString, showInFlowDirection = true) + // generate the minimized graph dependencyGraph = minimizer.execute(dependencyGraph) // RuleDependencyGraphGenerator.generate(rules, pruned = true) - dependencyGraph.export(s"${path}/rdg-${profile}-pruned.graphml") -// dependencyGraph.exportAsPDF(s"${path}/rdg-${profile}-pruned.pdf") + dependencyGraph.export(config.out.resolve(s"rdg_${config.profile}_minimized.graphml").toAbsolutePath.toString, showInFlowDirection = true) // generate the high-level dependency graph val highLevelDependencyGraph = HighLevelRuleDependencyGraphGenerator.generate(dependencyGraph) diff --git a/sansa-inference-flink/pom.xml b/sansa-inference-flink/pom.xml index 4632164..013c9ab 100644 --- a/sansa-inference-flink/pom.xml +++ b/sansa-inference-flink/pom.xml @@ -23,12 +23,12 @@ under the License. 
net.sansa-stack sansa-inference-parent_2.11 - 0.3.0 + 0.4.0 ../pom.xml net.sansa-stack sansa-inference-flink_${scala.binary.version} - 0.3.0 + 0.4.0 Inference API - Flink Apache Flink based inference layer for RDF and OWL diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala index 7960d41..66c64a8 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraph.scala @@ -68,7 +68,7 @@ case class RDFGraph(triples: DataSet[RDFTriple]) { * * @return the number of triples */ - def size() = { + def size(): Long = { triples.count() } } diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala index e9eafdb..68db1f1 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerOWLHorst.scala @@ -50,9 +50,9 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule .distinct() // 2. we compute the transitive closure of rdfs:subPropertyOf and rdfs:subClassOf - // rdfs11: (xxx rdfs:subClassOf yyy), (yyy rdfs:subClassOf zzz) -> (xxx rdfs:subClassOf zzz) + // rdfs11: (xxx rdfs:subClassOf yyy), (yyy rdfs:subClassOf zzz) -> (xxx rdfs:subClassOf zzz) val subClassOfTriplesTrans = computeTransitiveClosure(subClassOfTriples) - // rdfs5: (xxx rdfs:subPropertyOf yyy), (yyy rdfs:subPropertyOf zzz) -> (xxx rdfs:subPropertyOf zzz) + // rdfs5: (xxx rdfs:subPropertyOf yyy), (yyy rdfs:subPropertyOf zzz) -> (xxx rdfs:subPropertyOf zzz) val subPropertyOfTriplesTrans = computeTransitiveClosure(subPropertyOfTriples) @@ -159,8 +159,8 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // 2. SubPropertyOf inheritance according to rdfs7 is computed /* - rdfs7 aaa rdfs:subPropertyOf bbb . - xxx aaa yyy . xxx bbb yyy . + rdfs7 aaa rdfs:subPropertyOf bbb . + xxx aaa yyy . => xxx bbb yyy . */ val triplesRDFS7 = triplesFiltered @@ -173,8 +173,8 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // 3. Domain and Range inheritance according to rdfs2 and rdfs3 is computed /* - rdfs2 aaa rdfs:domain xxx . - yyy aaa zzz . yyy rdf:type xxx . + rdfs2 aaa rdfs:domain xxx . + yyy aaa zzz . => yyy rdf:type xxx . */ val triplesRDFS2 = rdfs7Res @@ -182,8 +182,8 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule .map(t => RDFTriple(t.s, RDF.`type`.getURI, domainMap(t.p))) /* - rdfs3 aaa rdfs:range xxx . - yyy aaa zzz . zzz rdf:type xxx . + rdfs3 aaa rdfs:range xxx . + yyy aaa zzz . => zzz rdf:type xxx . */ val triplesRDFS3 = rdfs7Res @@ -195,8 +195,8 @@ class ForwardRuleReasonerOWLHorst(env: ExecutionEnvironment) extends ForwardRule // input are the rdf:type triples from RDFS2/RDFS3 and the ones contained in the original graph /* - rdfs9 xxx rdfs:subClassOf yyy . - zzz rdf:type xxx . zzz rdf:type yyy . + rdfs9 xxx rdfs:subClassOf yyy . + zzz rdf:type xxx . => zzz rdf:type yyy . 
*/ val triplesRDFS9 = triplesRDFS2 diff --git a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala index 4d0a418..88d2c95 100644 --- a/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala +++ b/sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/forwardchaining/ForwardRuleReasonerRDFS.scala @@ -60,8 +60,8 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas computeTransitiveClosureOptSemiNaive(subClassOfTriples).name("rdfs11") /* - rdfs5 xxx rdfs:subPropertyOf yyy . - yyy rdfs:subPropertyOf zzz . xxx rdfs:subPropertyOf zzz . + rdfs5 xxx rdfs:subPropertyOf yyy . + yyy rdfs:subPropertyOf zzz . => xxx rdfs:subPropertyOf zzz . */ val subPropertyOfTriples = extractTriples(schemaTriples, RDFS.subPropertyOf.getURI) @@ -77,8 +77,8 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas // 2. SubPropertyOf inheritance according to rdfs7 is computed /* - rdfs7 aaa rdfs:subPropertyOf bbb . - xxx aaa yyy . xxx bbb yyy . + rdfs7 aaa rdfs:subPropertyOf bbb . + xxx aaa yyy . => xxx bbb yyy . */ val triplesRDFS7 = if (useSchemaBroadCasting) { otherTriples @@ -124,8 +124,8 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas // 3. Domain and Range inheritance according to rdfs2 and rdfs3 is computed /* - rdfs2 aaa rdfs:domain xxx . - yyy aaa zzz . yyy rdf:type xxx . + rdfs2 aaa rdfs:domain xxx . + yyy aaa zzz . => yyy rdf:type xxx . */ val domainTriples = extractTriples(schemaTriples, RDFS.domain.getURI).name("rdfs:domain") @@ -168,8 +168,8 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas }.name("rdfs2") /* - rdfs3 aaa rdfs:range xxx . - yyy aaa zzz . zzz rdf:type xxx . + rdfs3 aaa rdfs:range xxx . + yyy aaa zzz . => zzz rdf:type xxx . */ val rangeTriples = extractTriples(schemaTriples, RDFS.range.getURI).name("rdfs:range") @@ -220,8 +220,8 @@ class ForwardRuleReasonerRDFS(env: ExecutionEnvironment) extends ForwardRuleReas // 4. SubClass inheritance according to rdfs9 /* - rdfs9 xxx rdfs:subClassOf yyy . - zzz rdf:type xxx . zzz rdf:type yyy . + rdfs9 xxx rdfs:subClassOf yyy . + zzz rdf:type xxx . => zzz rdf:type yyy . 
*/ val triplesRDFS9 = if (useSchemaBroadCasting) { typeTriples // all rdf:type triples (s a A) diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/TCTest.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/TCTest.scala index 9f0dc01..33c66aa 100644 --- a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/TCTest.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/TCTest.scala @@ -1,26 +1,25 @@ package net.sansa_stack.inference.flink +import scala.collection.mutable + import org.apache.flink.api.common.functions.RichJoinFunction import org.apache.flink.api.common.operators.Order -import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment} +import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment, _} import org.apache.flink.core.fs.FileSystem.WriteMode -import org.apache.flink.test.util.{MultipleProgramsTestBase, TestBaseUtils} import org.apache.flink.test.util.MultipleProgramsTestBase.TestExecutionMode +import org.apache.flink.test.util.{MultipleProgramsTestBase, TestBaseUtils} import org.apache.flink.util.Collector import org.apache.jena.vocabulary.RDFS -import org.junit.{After, Before, Rule, Test} import org.junit.rules.TemporaryFolder -import scala.collection.mutable - -import org.apache.flink.api.scala._ import org.junit.runner.RunWith import org.junit.runners.Parameterized +import org.junit.{After, Before, Rule, Test} import net.sansa_stack.inference.data.RDFTriple -import net.sansa_stack.inference.flink.forwardchaining.TransitiveReasoner /** * A test case for the computation of the transitive closure (TC). + * * @author Lorenz Buehmann */ @RunWith(classOf[Parameterized]) @@ -35,7 +34,7 @@ class TCTest(mode: TestExecutionMode) extends MultipleProgramsTestBase(mode) { private var expectedResult: String = "" @Rule - def tempFolder = _tempFolder + def tempFolder: TemporaryFolder = _tempFolder @Before def before(): Unit = { @@ -123,12 +122,17 @@ class TCTest(mode: TestExecutionMode) extends MultipleProgramsTestBase(mode) { def performOptimized(triples: DataSet[RDFTriple]): DataSet[(String, String)] = { def iterate(s: DataSet[RDFTriple], ws: DataSet[RDFTriple]): (DataSet[RDFTriple], DataSet[RDFTriple]) = { val resolvedRedirects = triples.join(ws) - .where { _.s } - .equalTo { _.o } - .map { joinResult => joinResult match { - case (redirect, link) => - RDFTriple(link.s, redirect.p, redirect.o) + .where { + _.s } + .equalTo { + _.o + } + .map { joinResult => + joinResult match { + case (redirect, link) => + RDFTriple(link.s, redirect.p, redirect.o) + } }.name("TC-From-Iteration") (resolvedRedirects, resolvedRedirects) } @@ -136,8 +140,8 @@ class TCTest(mode: TestExecutionMode) extends MultipleProgramsTestBase(mode) { val tc = triples .iterateDelta(triples, 10, Array("s", "o"))(iterate) .name("Final-TC") -// .map { cl => cl} -// .name("Final-Redirect-Result") + // .map { cl => cl} + // .name("Final-Redirect-Result") tc.map(t => (t.s, t.o)) } @@ -158,19 +162,19 @@ class TCTest(mode: TestExecutionMode) extends MultipleProgramsTestBase(mode) { .join(tuples).where(1).equalTo(0)( new RichJoinFunction[(String, String), (String, String), (String, String)] { override def join(left: (String, String), right: (String, String)): (String, String) = { -// val context = getIterationRuntimeContext -// println("Iteration #" + context.getSuperstepNumber) -// println(context.getIndexOfThisSubtask + "/" + context.getNumberOfParallelSubtasks) + // val context = getIterationRuntimeContext + // 
println("Iteration #" + context.getSuperstepNumber) + // println(context.getIndexOfThisSubtask + "/" + context.getNumberOfParallelSubtasks) (left._1, right._2) } } ) -// { -// (left, right) => (left._1, right._2) -// } + // { + // (left, right) => (left._1, right._2) + // } .union(prevPaths) .groupBy(0, 1) - .reduce((l ,r) => l) + .reduce((l, r) => l) val terminate = prevPaths .coGroup(nextPaths) @@ -202,32 +206,32 @@ class TCTest(mode: TestExecutionMode) extends MultipleProgramsTestBase(mode) { val initialSolutionSet = tuples val initialWorkset = tuples -// val res = initialSolutionSet.iterateDelta(initialWorkset, maxIterations, Array(keyPosition)) { -// (solution, workset) => -// val deltas = workset.join(solution).where(1).equalTo(0){ -// (prev, next, out: Collector[(String, String)]) => { -// val prevPaths = prev.toSet -// for (n <- next) -// if (!prevPaths.contains(n)) out.collect(n) -// } -// } -// -// val nextWorkset = deltas.filter(new FilterByThreshold()) -// -// (deltas, nextWorkset) -// } -// res + // val res = initialSolutionSet.iterateDelta(initialWorkset, maxIterations, Array(keyPosition)) { + // (solution, workset) => + // val deltas = workset.join(solution).where(1).equalTo(0) { + // (prev, next, out: Collector[(String, String)]) => { + // val prevPaths = prev.toSet + // for (n <- next) + // if (!prevPaths.contains(n)) out.collect(n) + // } + // } + // + // val nextWorkset = deltas.filter(new FilterByThreshold()) + // + // (deltas, nextWorkset) + // } + // res tuples } - def getDataSimple(env: ExecutionEnvironment, scale: Int = 1) : DataSet[RDFTriple] = { + def getDataSimple(env: ExecutionEnvironment, scale: Int = 1): DataSet[RDFTriple] = { val triples = new mutable.HashSet[RDFTriple]() val begin = 1 val end = 10 * scale - for(i <- begin to end) { + for (i <- begin to end) { triples += RDFTriple(ns + "x" + i, p1, ns + "y" + i) triples += RDFTriple(ns + "y" + i, p1, ns + "z" + i) } @@ -235,10 +239,10 @@ class TCTest(mode: TestExecutionMode) extends MultipleProgramsTestBase(mode) { env.fromCollection(triples) } - def getExpectedResultSimple(scale: Int = 1) : String = { + def getExpectedResultSimple(scale: Int = 1): String = { var res = "" - for(i <- 1 to scale * 10) { + for (i <- 1 to scale * 10) { res += s"${ns}x$i,${ns}y$i\n" res += s"${ns}y$i,${ns}z$i\n" res += s"${ns}x$i,${ns}z$i\n" @@ -247,24 +251,24 @@ class TCTest(mode: TestExecutionMode) extends MultipleProgramsTestBase(mode) { res } - def getDataSinglePath(env: ExecutionEnvironment, length: Int = 10) : DataSet[RDFTriple] = { + def getDataSinglePath(env: ExecutionEnvironment, length: Int = 10): DataSet[RDFTriple] = { val triples = new mutable.HashSet[RDFTriple]() // graph is a path of length n // (x1, p, x2), (x2, p, x3), ..., (x(n-1), p, xn) val n = 10 - for(i <- 1 until length) { - triples += RDFTriple(ns + "x" + i, p1, ns + "x" + (i+1)) + for (i <- 1 until length) { + triples += RDFTriple(ns + "x" + i, p1, ns + "x" + (i + 1)) } env.fromCollection(triples) } - def getExpectedResultSinglePath(length: Int = 10) : String = { + def getExpectedResultSinglePath(length: Int = 10): String = { var res = "" - for(i <- 1 to length) { - for(j <- i+1 to length) { + for (i <- 1 to length) { + for (j <- i + 1 to length) { res += s"${ns}x$i,${ns}x${j}\n" } } diff --git a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala index 2c2010f..6e1940c 100644 --- 
a/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala +++ b/sansa-inference-flink/src/test/scala/net/sansa_stack/inference/flink/rules/TransitivityRuleTest.scala @@ -39,8 +39,8 @@ object TransitivityRuleTest { // graph is a path of length n // (x1, p, x2), (x2, p, x3), ..., (x(n-1), p, xn) val n = 10 - for(i <- 1 to end) { - triples += RDFTriple(ns + "x" + i, p1, ns + "x" + (i+1)) + for (i <- 1 to end) { + triples += RDFTriple(ns + "x" + i, p1, ns + "x" + (i + 1)) } val triplesDataset = env.fromCollection(triples) diff --git a/sansa-inference-spark/pom.xml b/sansa-inference-spark/pom.xml index 87106fa..040a309 100644 --- a/sansa-inference-spark/pom.xml +++ b/sansa-inference-spark/pom.xml @@ -4,12 +4,12 @@ net.sansa-stack sansa-inference-parent_2.11 - 0.3.0 + 0.4.0 ../pom.xml net.sansa-stack sansa-inference-spark_${scala.binary.version} - 0.3.0 + 0.4.0 Inference API - Spark Apache Spark based inference layer for RDF and OWL @@ -33,10 +33,24 @@ - - - - + + ${project.groupId} + sansa-rdf-spark_${scala.binary.version} + + + org.apache.hadoop + hadoop-common + + + org.apache.hadoop + hadoop-mapreduce-client-core + + + org.aksw.sparqlify + sparqlify-core + + + @@ -90,6 +104,7 @@ org.apache.jena jena-tdb ${jena.version} + provided @@ -112,11 +127,11 @@ - com.assembla.scala-incubator + org.scala-graph graph-core_${scala.binary.version} - com.assembla.scala-incubator + org.scala-graph graph-dot_${scala.binary.version} @@ -127,9 +142,11 @@ org.jgrapht jgrapht-ext + org.gephi gephi-toolkit + provided @@ -137,16 +154,11 @@ junit junit - - org.scalatest - scalatest_${scala.binary.version} - test - com.holdenkarau spark-testing-base_${scala.binary.version} - 2.2.0_0.8.0 + 2.3.0_0.9.0 test @@ -180,7 +192,7 @@ scala-logging_${scala.binary.version} - + com.github.scopt scopt_${scala.binary.version} @@ -192,6 +204,23 @@ config + + + org.apache.calcite + calcite-core + + + + org.codehaus.janino + janino + + + org.codehaus.janino + commons-compiler + + + + @@ -287,8 +316,8 @@ - unpack - package + resource-dependencies + install unpack-dependencies @@ -297,7 +326,7 @@ sansa-inference-tests_${scala.binary.version} true true - ${project.build.directory}/core-resources + ${project.build.directory}/test-classes org/**,META-INF/**,rebel.xml true true @@ -431,7 +460,7 @@ org.codehaus.janino:* org.codehaus.jettison:jettison org.fusesource.leveldbjni:leveldbjni-all - org.glassfish.hk2* + org.glassfish.hk2* org.glassfish.jersey* org.javassist:javassist org.json4s:json4s* @@ -448,6 +477,8 @@ org.gephi:* org.jfree:* com.itextpdf:* + org.apache.poi:* + org.apache.batik:* @@ -468,6 +499,18 @@ ** + + xerces:xercesImpl + + ** + + + + org.aksw.jena-sparql-api:* + + ** + + dist-${project.artifactId}-${project.version} @@ -586,7 +629,7 @@ commons-logging:commons-logging commons-net:commons-net io.dropwizard.metrics:metrics* - io.netty:netty* + io.netty:netty* javax.activation:activation javax.annotation:javax.annotation-api javax.servlet:javax.servlet-api @@ -621,7 +664,7 @@ org.codehaus.janino:* org.codehaus.jettison:jettison org.fusesource.leveldbjni:leveldbjni-all - org.glassfish.hk2* + org.glassfish.hk2* org.glassfish.jersey* org.javassist:javassist org.json4s:json4s* @@ -638,6 +681,9 @@ org.gephi:* org.jfree:* com.itextpdf:* + org.apache.poi:* + org.apache.batik:* + org.xerial:sqlite-jdbc diff --git a/sansa-inference-spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister 
b/sansa-inference-spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister deleted file mode 100644 index 50b23f2..0000000 --- a/sansa-inference-spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ /dev/null @@ -1,2 +0,0 @@ -net.sansa_stack.inference.spark.data.loader.sql.NTriplesDataSource -net.sansa_stack.inference.spark.data.loader.sql.TurtleDataSource \ No newline at end of file diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/RDFGraphLoader.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/RDFGraphLoader.scala index e525556..6e85e96 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/RDFGraphLoader.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/RDFGraphLoader.scala @@ -7,13 +7,15 @@ import net.sansa_stack.inference.spark.data.model.{RDFGraph, RDFGraphDataFrame, import net.sansa_stack.inference.utils.NTriplesStringToJenaTriple import org.apache.jena.graph.Triple import org.apache.jena.riot.Lang -import org.apache.spark.sql.{Dataset, SaveMode, SparkSession} +import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession} import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.LoggerFactory import scala.language.implicitConversions import org.apache.jena.vocabulary.RDF +import net.sansa_stack.rdf.spark.io.NTripleReader + /** * A class that provides methods to load an RDF graph from disk. * @@ -37,16 +39,7 @@ object RDFGraphLoader { * @return an RDF graph */ def loadFromDisk(session: SparkSession, path: String, minPartitions: Int = 2): RDFGraph = { - logger.info("loading triples from disk...") - val startTime = System.currentTimeMillis() - - val triples = session.sparkContext - .textFile(path, minPartitions) // read the text file - .map(new NTriplesStringToJenaTriple()) // convert to triple object -// .repartition(minPartitions) - -// logger.info("finished loading " + triples.count() + " triples in " + (System.currentTimeMillis()-startTime) + "ms.") - RDFGraph(triples) + RDFGraph(NTripleReader.load(session, path)) } /** @@ -84,18 +77,7 @@ object RDFGraphLoader { * @return an RDF graph */ def loadFromDiskAsRDD(session: SparkSession, path: String, minPartitions: Int): RDFGraphNative = { - logger.info("loading triples from disk...") - val startTime = System.currentTimeMillis() - - val converter = new NTriplesStringToJenaTriple() - - val triples = session.sparkContext - .textFile(path, minPartitions) // read the text file - .map(line => converter.apply(line)) // convert to triple object - - // logger.info("finished loading " + triples.count() + " triples in " + - // (System.currentTimeMillis()-startTime) + "ms.") - new RDFGraphNative(triples) + new RDFGraphNative(NTripleReader.load(session, path)) } private case class RDFTriple2(s: String, p: String, o: String) extends Product3[String, String, String] { @@ -127,15 +109,12 @@ object RDFGraphLoader { Array(splitted(0), splitted(1), splitted(2)) }) - implicit val rdfTripleEncoder = org.apache.spark.sql.Encoders.kryo[Triple] + implicit val rdfTripleEncoder: Encoder[Triple] = org.apache.spark.sql.Encoders.kryo[Triple] val spark = session.sqlContext - - val triples = session.read - .textFile(path) // read the text file - .map(new NTriplesStringToJenaTriple()) - .as[Triple](rdfTripleEncoder) + val triples = session + .createDataset(NTripleReader.load(session, path))(rdfTripleEncoder) 
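// Note: Jena's Triple is not a case class, so Spark has no built-in Encoder for it; the Kryo-based
// encoder defined above is therefore passed explicitly and stores each Triple as a serialized binary value.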
.as("triples") // (rdfTripleEncoder) // val rowRDD = session.sparkContext @@ -195,7 +174,7 @@ object RDFGraphLoader { * @param minPartitions min number of partitions for Hadoop RDDs ([[SparkContext.defaultMinPartitions]]) * @return an RDF graph based on a [[org.apache.spark.sql.DataFrame]] */ - def loadFromDiskAsDataFrame(session: SparkSession, path: String, minPartitions: Int, sqlSchema: SQLSchema = SQLSchemaDefault): RDFGraphDataFrame = { + def loadFromDiskAsDataFrame(session: SparkSession, path: String, minPartitions: Int = 4, sqlSchema: SQLSchema = SQLSchemaDefault): RDFGraphDataFrame = { val df = session .read .format("net.sansa_stack.inference.spark.data.loader.sql") @@ -208,7 +187,7 @@ object RDFGraphLoader { } def main(args: Array[String]): Unit = { - import net.sansa_stack.inference.spark.data.loader.sql.rdf._ + import net.sansa_stack.rdf.spark.io._ val path = args(0) val lang = args(1) match { @@ -247,9 +226,7 @@ object RDFGraphLoader { - import net.sansa_stack.inference.spark.data.loader.rdd.rdf._ - - val triplesRDD = session.sparkContext.rdf(lang)(path) + val triplesRDD = session.rdf(lang)(path) triples.show(10) println(triples.count()) triplesRDD diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/package.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/package.scala deleted file mode 100644 index a960d5c..0000000 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/package.scala +++ /dev/null @@ -1,127 +0,0 @@ -package net.sansa_stack.inference.spark.data - -import com.typesafe.config.{Config, ConfigFactory} -import org.apache.hadoop.fs.Path -import org.apache.jena.graph.Triple -import org.apache.jena.riot.Lang -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, DataFrameReader, DataFrameWriter, SaveMode} - -import net.sansa_stack.inference.utils.{JenaTripleToNTripleString, Logging, NTriplesStringToJenaTriple} - -package object rdf { - - /** - * The mode for parsing N-Triples. - */ - object ParseMode extends Enumeration { - type ParseMode = Value - val REGEX, SPLIT, JENA = Value - } - - // the DataFrame methods - - /** - * Adds methods, `ntriples` and `turtle`, to DataFrameWriter that allows to write N-Triples and Turtle files from a - * [[DataFrame]] using the `DataFrameWriter` - */ - implicit class RDFDataFrameWriter[T](writer: DataFrameWriter[T]) { - def rdf: String => Unit = writer.format("ntriples").save - def ntriples: String => Unit = writer.format("ntriples").save - } - - /** - * Adds methods, `rdf`, `ntriples` and `turtle`, to DataFrameReader that allows to read N-Triples and Turtle files using - * the `DataFrameReader` - */ - implicit class RDFDataFrameReader(reader: DataFrameReader) extends Logging { - @transient lazy val conf: Config = ConfigFactory.load("rdf_loader") - /** - * Load RDF data into a `DataFrame`. Currently, only N-Triples and Turtle syntax are supported - * @param lang the RDF language (Turtle or N-Triples) - * @return a `DataFrame[(String, String, String)]` - */ - def rdf(lang: Lang): String => DataFrame = lang match { - case i if lang == Lang.NTRIPLES => ntriples - case j if lang == Lang.TURTLE => turtle - case _ => throw new IllegalArgumentException(s"${lang.getLabel} syntax not supported yet!") - } - /** - * Load RDF data in N-Triples syntax into a `DataFrame` with columns `s`, `p`, and `o`. 
- * @return a `DataFrame[(String, String, String)]` - */ - def ntriples: String => DataFrame = { - log.debug(s"Parsing N-Triples with ${conf.getString("rdf.ntriples.parser")} ...") - reader.format("ntriples").load - } - /** - * Load RDF data in Turtle syntax into a `DataFrame` with columns `s`, `p`, and `o`. - * @return a `DataFrame[(String, String, String)]` - */ - def turtle: String => DataFrame = reader.format("turtle").load - } - - - // the RDD methods - - /** - * Adds methods, `ntriples` and `turtle`, to SparkContext that allows to write N-Triples and Turtle files - */ - implicit class RDFWriter[T](triples: RDD[Triple]) { - - val converter = new JenaTripleToNTripleString() - - def saveAsNTriplesFile(path: String, mode: SaveMode = SaveMode.ErrorIfExists): Unit = { - - val fsPath = new Path(path) - val fs = fsPath.getFileSystem(triples.sparkContext.hadoopConfiguration) - - mode match { - case SaveMode.Append => sys.error("Append mode is not supported by " + this.getClass.getCanonicalName); sys.exit(1) - case SaveMode.Overwrite => fs.delete(fsPath, true) - case SaveMode.ErrorIfExists => sys.error("Given path: " + path + " already exists!!"); sys.exit(1) - case SaveMode.Ignore => sys.exit() - } - - triples - .map(converter) // map to N-Triples string - .saveAsTextFile(path) - } - - } - - /** - * Adds methods, `rdf`, `ntriples` and `turtle`, to SparkContext that allows to read N-Triples and Turtle files - */ - implicit class RDFReader(sc: SparkContext) { - /** - * Load RDF data into an `RDD[Triple]`. Currently, only N-Triples and Turtle syntax are supported - * @param lang the RDF language (Turtle or N-Triples) - * @return the RDD - */ - def rdf(lang: Lang): String => RDD[Triple] = lang match { - case i if lang == Lang.NTRIPLES => ntriples - case j if lang == Lang.TURTLE => turtle - case _ => throw new IllegalArgumentException(s"${lang.getLabel} syntax not supported yet!") - } - - /** - * Load RDF data in N-Triples syntax into an `RDD[Triple]` - * @return the RDD - */ - def ntriples: String => RDD[Triple] = path => - sc - .textFile(path, 4) // read the text file - .map(new NTriplesStringToJenaTriple()) - - /** - * Load RDF data in Turtle syntax into an `RDD[Triple]` - * @return the RDD - */ - def turtle: String => RDD[Triple] = path => - sc - .textFile(path, 4) // read the text file - .map(new NTriplesStringToJenaTriple()) - } -} \ No newline at end of file diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/rdd/package.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/rdd/package.scala deleted file mode 100644 index 055b499..0000000 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/rdd/package.scala +++ /dev/null @@ -1,73 +0,0 @@ -package net.sansa_stack.inference.spark.data.loader.rdd - -import org.apache.hadoop.fs.Path - -import net.sansa_stack.inference.utils.{JenaTripleToNTripleString, NTriplesStringToJenaTriple} -import org.apache.jena.graph.Triple -import org.apache.jena.riot.Lang -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrameWriter, SaveMode} - -package object rdf { - - /** - * Adds methods, `ntriples` and `turtle`, to [[RDD]] that allows to write N-Triples and Turtle files - */ - implicit class RDFWriter[T](triples: RDD[Triple]) { - - val converter = new JenaTripleToNTripleString() - - def saveAsNTriplesFile(path: String, mode: SaveMode = SaveMode.ErrorIfExists): Unit = { - - val fsPath = new 
Path(path) - val fs = fsPath.getFileSystem(triples.sparkContext.hadoopConfiguration) - - mode match { - case SaveMode.Append => sys.error("Append mode is not supported by " + this.getClass.getCanonicalName); sys.exit(1) - case SaveMode.Overwrite => fs.delete(fsPath, true) - case SaveMode.ErrorIfExists => sys.error("Given path: " + path + " already exists!!"); sys.exit(1) - case SaveMode.Ignore => sys.exit() - } - - triples - .map(converter) // map to N-Triples string - .saveAsTextFile(path) - } - - } - - /** - * Adds methods, `rdf`, `ntriples` and `turtle`, to [[SparkContext]] that allows to read N-Triples and Turtle files - */ - implicit class RDFReader(sc: SparkContext) { - /** - * Load RDF data into an `RDD[Triple]`. Currently, only N-Triples and Turtle syntax are supported - * @param lang the RDF language (Turtle or N-Triples) - * @return the RDD - */ - def rdf(lang: Lang): String => RDD[Triple] = lang match { - case i if lang == Lang.NTRIPLES => ntriples - case j if lang == Lang.TURTLE => turtle - case _ => throw new IllegalArgumentException(s"${lang.getLabel} syntax not supported yet!") - } - - /** - * Load RDF data in N-Triples syntax into an `RDD[Triple]` - * @return the RDD - */ - def ntriples: String => RDD[Triple] = path => - sc - .textFile(path, 4) // read the text file - .map(new NTriplesStringToJenaTriple()) - - /** - * Load RDF data in Turtle syntax into an `RDD[Triple]` - * @return the RDD - */ - def turtle: String => RDD[Triple] = path => - sc - .textFile(path, 4) // read the text file - .map(new NTriplesStringToJenaTriple()) - } -} \ No newline at end of file diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/DefaultSource.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/DefaultSource.scala deleted file mode 100644 index ea697b2..0000000 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/DefaultSource.scala +++ /dev/null @@ -1,19 +0,0 @@ -package net.sansa_stack.inference.spark.data.loader.sql - -import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} -import org.apache.spark.sql.types.StructType - - -class DefaultSource extends RelationProvider with SchemaRelationProvider { - override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) - : BaseRelation = { - createRelation(sqlContext, parameters, null) - } - override def createRelation(sqlContext: SQLContext, parameters: Map[String, String] - , schema: StructType) - : BaseRelation = { - parameters.getOrElse("path", sys.error("'path' must be specified for our data.")) - return new NTriplesRelation(parameters.get("path").get, schema)(sqlContext) - } - } \ No newline at end of file diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/NTriplesDataSource.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/NTriplesDataSource.scala deleted file mode 100644 index 3492540..0000000 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/NTriplesDataSource.scala +++ /dev/null @@ -1,60 +0,0 @@ -package net.sansa_stack.inference.spark.data.loader.sql - -import com.typesafe.config.{Config, ConfigFactory} -import net.sansa_stack.inference.spark.data.rdf.ParseMode -import org.apache.hadoop.fs.Path -import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.StructType -import 
org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} - -/** - * The data source for handling N-Triples, i.e. reading from and writing to disk. - * - * @author Lorenz Buehmann - */ -class NTriplesDataSource - extends DataSourceRegister - with RelationProvider - with SchemaRelationProvider - with CreatableRelationProvider { - - lazy val conf: Config = ConfigFactory.load("rdf_loader") - - override def shortName(): String = "ntriples" - - // Used for reading from file without a given schema - override def createRelation(sqlContext: SQLContext, - parameters: Map[String, String]): BaseRelation = - new NTriplesRelation(parameters("path"), null, ParseMode.withName(conf.getString("rdf.ntriples.parser").toUpperCase))(sqlContext) - - // Used for reading from file with a given schema - override def createRelation(sqlContext: SQLContext, - parameters: Map[String, String], - schema: StructType): BaseRelation = - new NTriplesRelation(parameters("path"), schema, ParseMode.withName(conf.getString("rdf.ntriples.parser").toUpperCase))(sqlContext) - - // Used for writing to disk - override def createRelation(sqlContext: SQLContext, - mode: SaveMode, - parameters: Map[String, String], - data: DataFrame): BaseRelation = { - val path = parameters.getOrElse("path", "./output/") // can throw an exception/error, it's just for this tutorial - val fsPath = new Path(path) - val fs = fsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - - mode match { - case SaveMode.Append => sys.error("Append mode is not supported by " + this.getClass.getCanonicalName); sys.exit(1) - case SaveMode.Overwrite => fs.delete(fsPath, true) - case SaveMode.ErrorIfExists => sys.error("Given path: " + path + " already exists!!"); sys.exit(1) - case SaveMode.Ignore => sys.exit() - } - - val ntriplesRDD = data.rdd.map(row => { - row.toSeq.map(value => value.toString).mkString(" ") + " ." - }) - - ntriplesRDD.saveAsTextFile(path) - - createRelation(sqlContext, parameters, data.schema) - } -} diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/NTriplesRelation.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/NTriplesRelation.scala deleted file mode 100644 index db8c03d..0000000 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/NTriplesRelation.scala +++ /dev/null @@ -1,223 +0,0 @@ -package net.sansa_stack.inference.spark.data.loader.sql - -import java.io.ByteArrayInputStream -import java.util.regex.Pattern - -import net.sansa_stack.inference.spark.data.rdf.ParseMode.{ParseMode, _} -import net.sansa_stack.inference.utils.Logging -import org.apache.jena.graph.Node -import org.apache.jena.riot.lang.LangNTriples -import org.apache.jena.riot.system.RiotLib -import org.apache.jena.riot.tokens.{Tokenizer, TokenizerFactory} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.sources.{BaseRelation, PrunedScan, TableScan} -import org.apache.spark.sql.types.{StringType, StructField, StructType} -import org.apache.spark.sql.{Row, SQLContext} - -import scala.util.{Failure, Success, Try} - -/** - * A custom relation that represents N-Triples. 
- * - * @param location - * @param userSchema - * @param sqlContext - * @param mode how to parse each line in the N-Triples file (DEFAULT: [[ParseMode]].`REGEX`) - */ -class NTriplesRelation(location: String, userSchema: StructType, val mode: ParseMode = REGEX) - (@transient val sqlContext: SQLContext) - extends BaseRelation - with TableScan - with PrunedScan - with Serializable - with Logging { - - /** - * Whether to skip blank lines or throw an exception. - */ - val skipBlankLines = true - - override def schema: StructType = { - if (this.userSchema != null) { - this.userSchema - } - else { - StructType( - Seq( - StructField("s", StringType, nullable = true), - StructField("p", StringType, nullable = true), - StructField("o", StringType, nullable = true) - )) - } - } - - override def buildScan(): RDD[Row] = { - val rdd = sqlContext - .sparkContext - .textFile(location) - - val rows = mode match { - case REGEX => rdd.map(line => Row.fromTuple(parseRegexPattern(line))) - case SPLIT => rdd.map(line => Row.fromSeq(line.split(" ").toList)) - case JENA => rdd.map(parseJena(_).get).map(t => Row.fromSeq(Seq(t.getSubject.toString, t.getPredicate.toString, t.getObject.toString))) - } - rows - } - - // scan with column pruning - override def buildScan(requiredColumns: Array[String]): RDD[Row] = { - // load the RDD of lines first - val rdd = sqlContext - .sparkContext - .textFile(location) - - // map column names to positions in triple - implicit val positions = requiredColumns.map( - { - case "s" => 1 - case "p" => 2 - case "o" => 3 - } - ) - - // apply different line processing based on the configured parsing mode - val tuples = mode match { - case REGEX => rdd.map(line => { - val tripleOpt = parseRegexPattern(line) - if(tripleOpt.isDefined) { - Some(extractFromTriple(tripleOpt.get)) - } else { - None - } - }) - case SPLIT => rdd.map(line => Some(extractFromTriple(parseRegexSplit(line)))) - case JENA => rdd.map(line => Some(extractFromJenaTriple(parseJena(line).get).map(_.toString))) - } - - val rows = tuples.flatMap(t => { - if (t.isDefined) { - Some(Row.fromSeq(t.get)) - } else { - // TODO error handling - None - } - }) - - rows - } - - private def extractFromTriple(triple: (String, String, String))(implicit positions: Array[Int]): Seq[String] = { - positions.map({ - case 1 => triple._1 - case 2 => triple._2 - case 3 => triple._3 - }).toSeq - } - - private def extractFromJenaTriple(triple: org.apache.jena.graph.Triple)(implicit positions: Array[Int]): Seq[Node] = { - positions.map({ - case 1 => triple.getSubject - case 2 => triple.getPredicate - case 3 => triple.getObject - }).toSeq - } - - /** - * Parse with Jena API - * @param s - * @return - */ - private def parseJena(s: String): Try[org.apache.jena.graph.Triple] = { - // always close the streams - cleanly(new ByteArrayInputStream(s.getBytes))(_.close()) { is => - val profile = RiotLib.dftProfile - val tokenizer: Tokenizer = TokenizerFactory.makeTokenizerUTF8(is) - val parser = new LangNTriples(tokenizer, profile, null) - parser.next() - } - } - - // the REGEX pattern for N-Triples - val pattern: Pattern = Pattern.compile( - """|^ - |(<([^>]*)>|(?]+)(?)) - |\s* - |<([^>]+)> - |\s* - |(<([^>]+)>|(.*)) - |\s*[.]\s*(#.*)?$ - """.stripMargin.replaceAll("\n", "").trim) - - /** - * Parse with REGEX pattern - * @param s - * @return - */ - private def parseRegexPattern(s: String): Option[(String, String, String)] = { - // skip blank lines - if (s.trim.isEmpty) { - None - } else { - - val matcher = pattern.matcher(s) - -// println(matcher.matches() + 
"---" + s) - - if (matcher.matches) { - // for(i <- 0 to matcher.groupCount()) - // println(i + ":" + matcher.group(i)) - - // parse the subject - val subject = if (matcher.group(2) == null) { // this means it's a blank node captured in group 1 (or 3) - matcher.group(1) - } else { // it is a URI - matcher.group(2) - } - - // parse the predicate - val predicate = matcher.group(4) - - // parse the object - val obj = if (matcher.group(6) == null) { // this means it is a literal - matcher.group(7).trim - } else { // it is a URI - matcher.group(6) - } - - Some((subject, predicate, obj)) - } else { - throw new Exception(s"WARN: Illegal N-Triples syntax. Ignoring triple $s") - } - } - } - - /** - * Parse with simple split on whitespace characters and replace <, >, and . chars - * @param s - * @return - */ - private def parseRegexSplit(s: String): (String, String, String) = { - val s1 = s.trim - val split = s1.substring(0, s1.lastIndexOf('.')).split("\\s", 3) - var obj = split(2).trim - obj = obj.substring(0, obj.lastIndexOf('.')) - (split(0), split(1), obj) - } - - private def cleanly[A, B](resource: A)(cleanup: A => Unit)(doWork: A => B): Try[B] = { - try { - Success(doWork(resource)) - } catch { - case e: Exception => Failure(e) - } - finally { - try { - if (resource != null) { - cleanup(resource) - } - } catch { - case e: Exception => log.error(e.getMessage) // should be logged - } - } - } -} \ No newline at end of file diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/TurtleDataSource.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/TurtleDataSource.scala deleted file mode 100644 index 74178cc..0000000 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/TurtleDataSource.scala +++ /dev/null @@ -1,25 +0,0 @@ -package net.sansa_stack.inference.spark.data.loader.sql - -import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider, SchemaRelationProvider} -import org.apache.spark.sql.types.StructType - -/** - * @author Lorenz Buehmann - */ -class TurtleDataSource - extends DataSourceRegister - with RelationProvider - with SchemaRelationProvider { - - override def shortName(): String = "turtle" - - override def createRelation(sqlContext: SQLContext, - parameters: Map[String, String]): BaseRelation = - new TurtleRelation(parameters("path"), null)(sqlContext) - - override def createRelation(sqlContext: SQLContext, - parameters: Map[String, String], - schema: StructType): BaseRelation = - new TurtleRelation(parameters("path"), schema)(sqlContext) -} diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/TurtleRelation.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/TurtleRelation.scala deleted file mode 100644 index c080b09..0000000 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/TurtleRelation.scala +++ /dev/null @@ -1,133 +0,0 @@ -package net.sansa_stack.inference.spark.data.loader.sql - -import java.io.ByteArrayInputStream - -import org.apache.hadoop.io.{LongWritable, Text} -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat -import org.apache.jena.riot.{Lang, RDFDataMgr} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.sources.{BaseRelation, PrunedScan, TableScan} -import org.apache.spark.sql.types.{StringType, StructField, StructType} -import 
org.apache.spark.sql.{Row, SQLContext} - -import scala.util.{Failure, Success, Try} - -/** - * A custom relation that represents RDF triples loaded from files in Turtle syntax. - * - * @param location - * @param userSchema - * @param sqlContext - */ -class TurtleRelation(location: String, userSchema: StructType) - (@transient val sqlContext: SQLContext) - extends BaseRelation - with TableScan - with PrunedScan - with Serializable { - - override def schema: StructType = { - if (this.userSchema != null) { - this.userSchema - } - else { - StructType( - Seq( - StructField("s", StringType, true), - StructField("p", StringType, true), - StructField("o", StringType, true) - )) - } - } - - - import scala.collection.JavaConverters._ - - override def buildScan(): RDD[Row] = { - - val confHadoop = new org.apache.hadoop.mapreduce.Job().getConfiguration - confHadoop.set("textinputformat.record.delimiter", ".\n") - - // 1. parse the Turtle file into an RDD[String] with each entry containing a full Turtle snippet - val turtleRDD = sqlContext.sparkContext.newAPIHadoopFile( - location, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], confHadoop) - .filter(!_._2.toString.trim.isEmpty) - .map{ case (_, v) => v.toString } - -// turtleRDD.collect().foreach(chunk => println("Chunk" + chunk)) - - // 2. we need the prefixes - two options: - // a) assume that all prefixes occur in the beginning of the document - // b) filter all lines that contain the prefixes - val prefixes = turtleRDD.filter(_.startsWith("@prefix")) - - // we broadcast the prefixes - val prefixesBC = sqlContext.sparkContext.broadcast(prefixes.collect()) - - // use the Jena Turtle parser to get the triples - val rows = turtleRDD.flatMap(ttl => { - cleanly(new ByteArrayInputStream((prefixesBC.value.mkString("\n") + ttl).getBytes))(_.close()) { is => - // parse the text snippet with Jena - val iter = RDFDataMgr.createIteratorTriples(is, Lang.TURTLE, null).asScala - - iter.map(t => Row.fromTuple((t.getSubject.toString, t.getPredicate.toString, t.getObject.toString))).toSeq - }.get - - }) - - rows - } - - override def buildScan(requiredColumns: Array[String]): RDD[Row] = { - val confHadoop = new org.apache.hadoop.mapreduce.Job().getConfiguration - confHadoop.set("textinputformat.record.delimiter", ".\n") - - // 1. parse the Turtle file into an RDD[String] with each entry containing a full Turtle snippet - val turtleRDD = sqlContext.sparkContext.newAPIHadoopFile( - location, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], confHadoop) - .filter(!_._2.toString.trim.isEmpty) - .map{ case (_, v) => v.toString.trim } - -// turtleRDD.collect().foreach(chunk => println("Chunk:" + chunk)) - - // 2. 
we need the prefixes - two options: - // a) assume that all prefixes occur in the beginning of the document - // b) filter all lines that contain the prefixes - val prefixes = turtleRDD.filter(_.startsWith("@prefix")) - - // we broadcast the prefixes - val prefixesBC = sqlContext.sparkContext.broadcast(prefixes.collect()) - - // use the Jena Turtle parser to get the triples - val rows = turtleRDD.flatMap(ttl => { -// println("snippet:" + prefixesBC.value.mkString("\n") + ttl) - cleanly(new ByteArrayInputStream((prefixesBC.value.mkString("\n") + ttl).getBytes))(_.close()) { is => - // parse the text snippet with Jena - val iter = RDFDataMgr.createIteratorTriples(is, Lang.TURTLE, null).asScala - - iter.map(t => Row.fromTuple((t.getSubject.toString, t.getPredicate.toString, t.getObject.toString))).toSeq - }.get - - }) - - rows - } - - - def cleanly[A, B](resource: A)(cleanup: A => Unit)(doWork: A => B): Try[B] = { - try { - Success(doWork(resource)) - } catch { - case e: Exception => Failure(e) - } - finally { - try { - if (resource != null) { - cleanup(resource) - } - } catch { - case e: Exception => println(e) // should be logged - } - } - } -} \ No newline at end of file diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/package.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/package.scala deleted file mode 100644 index 3cbe8da..0000000 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/package.scala +++ /dev/null @@ -1,53 +0,0 @@ -package net.sansa_stack.inference.spark.data.loader.sql - -import com.typesafe.config.{Config, ConfigFactory} -import net.sansa_stack.inference.utils.Logging -import org.apache.jena.riot.Lang -import org.apache.spark.sql.{DataFrame, DataFrameReader, DataFrameWriter} - -/** - * Wrap implicits to load/write RDF data into/from a [[DataFrame]]. - */ -package object rdf { - - /** - * Adds methods, `ntriples` and `turtle`, to [[DataFrameWriter]] that allows to write N-Triples and Turtle files. - */ - implicit class RDFDataFrameWriter[T](writer: DataFrameWriter[T]) { - def rdf: String => Unit = writer.format("ntriples").save - def ntriples: String => Unit = writer.format("ntriples").save - } - - /** - * Adds methods, `rdf`, `ntriples` and `turtle`, to [[DataFrameReader]] that allows to read N-Triples and Turtle files. - */ - implicit class RDFDataFrameReader(reader: DataFrameReader) extends Logging { - - @transient lazy val conf: Config = ConfigFactory.load("rdf_loader") - - /** - * Load RDF data into a [[DataFrame]]. - * Currently, only N-Triples and Turtle syntax are supported! - * @param lang the RDF language (Turtle or N-Triples) - * @return a `DataFrame[(String, String, String)]` - */ - def rdf(lang: Lang): String => DataFrame = lang match { - case i if lang == Lang.NTRIPLES => ntriples - case j if lang == Lang.TURTLE => turtle - case _ => throw new IllegalArgumentException(s"${lang.getLabel} syntax not supported yet!") - } - /** - * Load RDF data in N-Triples syntax into a [[DataFrame]] with columns `s`, `p`, and `o`. - * @return a [[DataFrame]][(String, String, String)] - */ - def ntriples: String => DataFrame = { - log.debug(s"Parsing N-Triples with ${conf.getString("rdf.ntriples.parser")} ...") - reader.format("ntriples").load - } - /** - * Load RDF data in Turtle syntax into a [[DataFrame]] with columns `s`, `p`, and `o`. 
- * @return a [[DataFrame]][(String, String, String)] - */ - def turtle: String => DataFrame = reader.format("turtle").load - } -} \ No newline at end of file diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/EmptyRDFGraphDataFrame.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/EmptyRDFGraphDataFrame.scala index 94f419d..5a4f73e 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/EmptyRDFGraphDataFrame.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/EmptyRDFGraphDataFrame.scala @@ -4,6 +4,8 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SQLContext} /** + * Represents an empty RDF graph as Dataframe. + * * @author Lorenz Buehmann */ object EmptyRDFGraphDataFrame { @@ -13,7 +15,7 @@ object EmptyRDFGraphDataFrame { val schemaString = "subject predicate object" // generate the schema based on the string of schema - val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))) + val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true))) // convert triples RDD to rows val rowRDD = sqlContext.sparkContext.emptyRDD[Row] @@ -26,4 +28,4 @@ object EmptyRDFGraphDataFrame { triplesDataFrame } -} \ No newline at end of file +} diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/RDFGraphDataset.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/RDFGraphDataset.scala index 089638a..bc635ce 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/RDFGraphDataset.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/RDFGraphDataset.scala @@ -35,7 +35,7 @@ class RDFGraphDataset(override val triples: Dataset[Triple]) def unionAll(graphs: Seq[RDFGraphDataset]): RDFGraphDataset = { // the Dataframe based solution - return graphs.reduce(_ union _) + graphs.reduce(_ union _) // // to limit the lineage, we convert to RDDs first, and use the SparkContext Union method for a sequence of RDDs // val df: Option[DataFrame] = graphs match { diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/RDFTuple.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/RDFTuple.scala index fae47f7..2e0efea 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/RDFTuple.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/model/RDFTuple.scala @@ -5,10 +5,10 @@ package net.sansa_stack.inference.spark.data.model * * @param s the subject * @param o the object - * * @author Lorenz Buehmann */ case class RDFTuple(s: String, o: String) extends Product2[String, String] { - override def _1: String = s - override def _2: String = o - } \ No newline at end of file + override def _1: String = s + + override def _2: String = o +} diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/AbstractForwardRuleReasonerRDFS.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/AbstractForwardRuleReasonerRDFS.scala index f77e035..e9e6f67 100644 --- 
a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/AbstractForwardRuleReasonerRDFS.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/AbstractForwardRuleReasonerRDFS.scala @@ -51,14 +51,14 @@ abstract class AbstractForwardRuleReasonerRDFS[Rdf <: RDF, D, G <: AbstractRDFGr // println("others:" + others.size()) /* - rdfs5 xxx rdfs:subPropertyOf yyy . - yyy rdfs:subPropertyOf zzz . xxx rdfs:subPropertyOf zzz . + rdfs5 xxx rdfs:subPropertyOf yyy . + yyy rdfs:subPropertyOf zzz . xxx rdfs:subPropertyOf zzz . */ val r5 = rule5(graph) /* - rdfs7 aaa rdfs:subPropertyOf bbb . - xxx aaa yyy . xxx bbb yyy . + rdfs7 aaa rdfs:subPropertyOf bbb . + xxx aaa yyy . xxx bbb yyy . */ val r7 = rule7(others) others = others.union(r7) @@ -73,8 +73,8 @@ abstract class AbstractForwardRuleReasonerRDFS[Rdf <: RDF, D, G <: AbstractRDFGr val r11 = rule11(graph) /* - rdfs9 xxx rdfs:subClassOf yyy . - zzz rdf:type xxx . zzz rdf:type yyy . + rdfs9 xxx rdfs:subClassOf yyy . + zzz rdf:type xxx . zzz rdf:type yyy . */ val r9 = rule9(types) diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/FixpointIteration.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/FixpointIteration.scala index 9785745..c852c72 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/FixpointIteration.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/FixpointIteration.scala @@ -27,7 +27,7 @@ object FixpointIteration extends Logging { * the termination criterion. The iterations terminate when either the termination criterion * [[RDD]] contains no elements or when `maxIterations` iterations have been performed. * - **/ + */ def apply[T: ClassTag](maxIterations: Int = 10)(rdd: RDD[T], f: RDD[T] => RDD[T]): RDD[T] = { var newRDD = rdd newRDD.cache() @@ -56,7 +56,7 @@ object FixpointIteration extends Logging { * the termination criterion. The iterations terminate when either the termination criterion * RDD contains no elements or when `maxIterations` iterations have been performed. * - **/ + */ def apply2[T: ClassTag](maxIterations: Int = 10)(dataset: Dataset[T], f: Dataset[T] => Dataset[T]): Dataset[T] = { var newDS = dataset newDS.cache() diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasoner.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasoner.scala index c0e1d53..a76720c 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasoner.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasoner.scala @@ -15,6 +15,16 @@ import scala.collection.mutable */ trait ForwardRuleReasoner extends Profiler { + /** + * Applies forward chaining to the given RDD of RDF triples and returns a new + * RDD of RDF triples that contains all additional triples based on the underlying + * set of rules. 
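 * A minimal usage sketch (illustrative only; it assumes an existing `SparkContext` named `sc`, an
 * `RDD[Triple]` named `triples`, and a concrete reasoner such as `ForwardRuleReasonerRDFS`):
 * {{{
 *   val reasoner = new ForwardRuleReasonerRDFS(sc)
 *   val inferred: RDD[Triple] = reasoner(triples)
 * }}}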
+ * + * @param triples the RDF triples + * @return the materialized set of RDF triples + */ + def apply(triples: RDD[Triple]) : RDD[Triple] = apply(RDFGraph(triples)).triples + /** * Applies forward chaining to the given RDF graph and returns a new RDF graph that contains all additional * triples based on the underlying set of rules. diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerOWLHorst.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerOWLHorst.scala index 03ee540..d6690ca 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerOWLHorst.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerOWLHorst.scala @@ -179,8 +179,8 @@ class ForwardRuleReasonerOWLHorst(sc: SparkContext, parallelism: Int = 2) extend // 2. SubPropertyOf inheritance according to rdfs7 is computed /* - rdfs7 aaa rdfs:subPropertyOf bbb . - xxx aaa yyy . xxx bbb yyy . + rdfs7 aaa rdfs:subPropertyOf bbb . + xxx aaa yyy . xxx bbb yyy . */ val triplesRDFS7 = triplesFiltered @@ -193,8 +193,8 @@ class ForwardRuleReasonerOWLHorst(sc: SparkContext, parallelism: Int = 2) extend // 3. Domain and Range inheritance according to rdfs2 and rdfs3 is computed /* - rdfs2 aaa rdfs:domain xxx . - yyy aaa zzz . yyy rdf:type xxx . + rdfs2 aaa rdfs:domain xxx . + yyy aaa zzz . yyy rdf:type xxx . */ val triplesRDFS2 = rdfs7Res @@ -202,8 +202,8 @@ class ForwardRuleReasonerOWLHorst(sc: SparkContext, parallelism: Int = 2) extend .map(t => Triple.create(t.s, RDF.`type`.asNode, domainMapBC.value(t.p))) /* - rdfs3 aaa rdfs:range xxx . - yyy aaa zzz . zzz rdf:type xxx . + rdfs3 aaa rdfs:range xxx . + yyy aaa zzz . zzz rdf:type xxx . */ val triplesRDFS3 = rdfs7Res @@ -215,8 +215,8 @@ class ForwardRuleReasonerOWLHorst(sc: SparkContext, parallelism: Int = 2) extend // input are the rdf:type triples from RDFS2/RDFS3 and the ones contained in the original graph /* - rdfs9 xxx rdfs:subClassOf yyy . - zzz rdf:type xxx . zzz rdf:type yyy . + rdfs9 xxx rdfs:subClassOf yyy . + zzz rdf:type xxx . zzz rdf:type yyy . */ val triplesRDFS9 = triplesRDFS2 diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerRDFS.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerRDFS.scala index 5aa4984..65f98fb 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerRDFS.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerRDFS.scala @@ -10,9 +10,10 @@ import org.apache.jena.graph.Triple import org.apache.jena.vocabulary.{RDF, RDFS} import org.apache.spark.SparkContext import org.slf4j.LoggerFactory - import scala.collection.mutable +import org.apache.spark.rdd.RDD + /** * A forward chaining implementation of the RDFS entailment regime. 
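 * In outline, the implementation below first (optionally) extracts and caches the schema triples,
 * computes the transitive closures of rdfs:subClassOf (rdfs11) and rdfs:subPropertyOf (rdfs5),
 * applies property inheritance (rdfs7) and the domain and range rules (rdfs2, rdfs3) via collected
 * schema maps, then class inheritance (rdfs9), and finally unions and deduplicates the results.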
* @@ -39,8 +40,10 @@ class ForwardRuleReasonerRDFS(sc: SparkContext, parallelism: Int = 2) extends Tr // as an optimization, we can extract all schema triples first which avoids to run on the whole dataset // for each schema triple later - val schemaTriples = if (extractSchemaTriplesInAdvance) new RDFSSchemaExtractor().extract(triplesRDD) + val schemaTriples = if (extractSchemaTriplesInAdvance) new RDFSSchemaExtractor().extract(triplesRDD).cache() else triplesRDD + schemaTriples.setName("schema triples") +// println(s"#schema: ${schemaTriples.count()}") // 1. we first compute the transitive closure of rdfs:subPropertyOf and rdfs:subClassOf @@ -49,14 +52,14 @@ class ForwardRuleReasonerRDFS(sc: SparkContext, parallelism: Int = 2) extends Tr * rdfs11 xxx rdfs:subClassOf yyy . * yyy rdfs:subClassOf zzz . xxx rdfs:subClassOf zzz . */ - val subClassOfTriples = extractTriples(schemaTriples, RDFS.subClassOf.asNode()) // extract rdfs:subClassOf triples + val subClassOfTriples = extractTriples(schemaTriples, RDFS.subClassOf.asNode()).cache() // extract rdfs:subClassOf triples val subClassOfTriplesTrans = computeTransitiveClosure(subClassOfTriples, RDFS.subClassOf.asNode()).setName("rdfs11")// mutable.Set()++subClassOfTriples.collect()) /* - rdfs5 xxx rdfs:subPropertyOf yyy . - yyy rdfs:subPropertyOf zzz . xxx rdfs:subPropertyOf zzz . + rdfs5 xxx rdfs:subPropertyOf yyy . + yyy rdfs:subPropertyOf zzz . xxx rdfs:subPropertyOf zzz . */ - val subPropertyOfTriples = extractTriples(schemaTriples, RDFS.subPropertyOf.asNode()) // extract rdfs:subPropertyOf triples + val subPropertyOfTriples = extractTriples(schemaTriples, RDFS.subPropertyOf.asNode()).cache() // extract rdfs:subPropertyOf triples val subPropertyOfTriplesTrans = computeTransitiveClosure(subPropertyOfTriples, RDFS.subPropertyOf.asNode()).setName("rdfs5")// extractTriples(mutable.Set()++subPropertyOfTriples.collect(), RDFS.subPropertyOf.getURI)) // a map structure should be more efficient @@ -71,7 +74,9 @@ class ForwardRuleReasonerRDFS(sc: SparkContext, parallelism: Int = 2) extends Tr // split by rdf:type val split = triplesRDD.partitionBy(t => t.p == RDF.`type`.asNode) var typeTriples = split._1 + typeTriples.setName("rdf:type triples") var otherTriples = split._2 + otherTriples.setName("other triples") // val formatter = java.text.NumberFormat.getIntegerInstance // println("triples" + formatter.format(triplesRDD.count())) @@ -81,8 +86,8 @@ class ForwardRuleReasonerRDFS(sc: SparkContext, parallelism: Int = 2) extends Tr // 2. SubPropertyOf inheritance according to rdfs7 is computed /* - rdfs7 aaa rdfs:subPropertyOf bbb . - xxx aaa yyy . xxx bbb yyy . + rdfs7 aaa rdfs:subPropertyOf bbb . + xxx aaa yyy . xxx bbb yyy . */ val triplesRDFS7 = otherTriples // all triples (s p1 o) @@ -92,13 +97,13 @@ class ForwardRuleReasonerRDFS(sc: SparkContext, parallelism: Int = 2) extends Tr .setName("rdfs7") // add triples - otherTriples = otherTriples.union(triplesRDFS7) + otherTriples = otherTriples.union(triplesRDFS7).setName("other triples with rdfs7") // 3. Domain and Range inheritance according to rdfs2 and rdfs3 is computed /* - rdfs2 aaa rdfs:domain xxx . - yyy aaa zzz . yyy rdf:type xxx . + rdfs2 aaa rdfs:domain xxx . + yyy aaa zzz . yyy rdf:type xxx . */ val domainTriples = extractTriples(schemaTriples, RDFS.domain.asNode()) val domainMap = domainTriples.map(t => (t.s, t.o)).collect.toMap @@ -111,8 +116,8 @@ class ForwardRuleReasonerRDFS(sc: SparkContext, parallelism: Int = 2) extends Tr .setName("rdfs2") /* - rdfs3 aaa rdfs:range xxx . 
- yyy aaa zzz . zzz rdf:type xxx . + rdfs3 aaa rdfs:range xxx . + yyy aaa zzz . zzz rdf:type xxx . */ val rangeTriples = extractTriples(schemaTriples, RDFS.range.asNode()) val rangeMap = rangeTriples.map(t => (t.s, t.o)).collect().toMap @@ -125,16 +130,16 @@ class ForwardRuleReasonerRDFS(sc: SparkContext, parallelism: Int = 2) extends Tr .setName("rdfs3") // rdfs2 and rdfs3 generated rdf:type triples which we'll add to the existing ones - val triples23 = triplesRDFS2.union(triplesRDFS3) + val triples23 = triplesRDFS2.union(triplesRDFS3).setName("rdfs2 + rdfs3") // all rdf:type triples here as intermediate result - typeTriples = typeTriples.union(triples23) + typeTriples = typeTriples.union(triples23).setName("rdf:type + rdfs2 + rdfs3") // 4. SubClass inheritance according to rdfs9 /* - rdfs9 xxx rdfs:subClassOf yyy . - zzz rdf:type xxx . zzz rdf:type yyy . + rdfs9 xxx rdfs:subClassOf yyy . + zzz rdf:type xxx . zzz rdf:type yyy . */ val triplesRDFS9 = typeTriples // all rdf:type triples (s a A) @@ -168,8 +173,9 @@ class ForwardRuleReasonerRDFS(sc: SparkContext, parallelism: Int = 2) extends Tr subClassOfTriplesTrans, subPropertyOfTriplesTrans, typeTriples, - triplesRDFS7, +// triplesRDFS7, triplesRDFS9)) + .setName("rdf:type + other + rdfs2 + rdfs3 + rdfs5 + rdfs7 + rdfs9 + rdfs11") .distinct(parallelism) // we perform also additional rules if enabled @@ -180,7 +186,7 @@ class ForwardRuleReasonerRDFS(sc: SparkContext, parallelism: Int = 2) extends Tr // rdfs4a: (s p o) => (s rdf:type rdfs:Resource) // rdfs4b: (s p o) => (o rdf:type rdfs:Resource) // filter by literals - // TODO not sure which version is more effcient, using a FILTER + UNION, or doing it via faltMap but creating Set objects + // TODO not sure which version is more efficient, using a FILTER + UNION, or doing it via faltMap but creating Set objects // val rdfs4 = allTriples.map(t => Triple.create(t.s, RDF.`type`.asNode(), RDFS.Resource.asNode())) // .union( // allTriples.filter(!_.getObject.isLiteral).map(t => Triple.create(t.o, RDF.`type`.asNode(), RDFS.Resource.asNode()))) diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerRDFSDataframe.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerRDFSDataframe.scala index 4a23263..d6f41f6 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerRDFSDataframe.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/ForwardRuleReasonerRDFSDataframe.scala @@ -65,8 +65,8 @@ class ForwardRuleReasonerRDFSDataframe(session: SparkSession, parallelism: Int = // val checkSubclass = udf((cls: String) => subClassOfMapBC.value.contains(cls)) // val makeSuperTypeTriple = udf((ind: String, cls: String) => (ind, subClassOfMapBC.value(cls))) /* - rdfs5 xxx rdfs:subPropertyOf yyy . - yyy rdfs:subPropertyOf zzz . xxx rdfs:subPropertyOf zzz . + rdfs5 xxx rdfs:subPropertyOf yyy . + yyy rdfs:subPropertyOf zzz . xxx rdfs:subPropertyOf zzz . */ val subPropertyOfTriples = index(RDFS.subPropertyOf.asNode()) // extract rdfs:subPropertyOf triples val subPropertyOfTriplesTrans = broadcast(computeTransitiveClosureDF(subPropertyOfTriples.as[RDFTriple]).toDF().alias("SP")) @@ -95,8 +95,8 @@ class ForwardRuleReasonerRDFSDataframe(session: SparkSession, parallelism: Int = // 2. 
SubPropertyOf inheritance according to rdfs7 is computed /* - rdfs7 aaa rdfs:subPropertyOf bbb . - xxx aaa yyy . xxx bbb yyy . + rdfs7 aaa rdfs:subPropertyOf bbb . + xxx aaa yyy . xxx bbb yyy . */ val triplesRDFS7 = triples // all triples (s p1 o) @@ -117,8 +117,8 @@ class ForwardRuleReasonerRDFSDataframe(session: SparkSession, parallelism: Int = // 3. Domain and Range inheritance according to rdfs2 and rdfs3 is computed /* - rdfs2 aaa rdfs:domain xxx . - yyy aaa zzz . yyy rdf:type xxx . + rdfs2 aaa rdfs:domain xxx . + yyy aaa zzz . yyy rdf:type xxx . */ val domainTriples = broadcast(index(RDFS.domain.asNode()).alias("DOM")) @@ -132,8 +132,8 @@ class ForwardRuleReasonerRDFSDataframe(session: SparkSession, parallelism: Int = // triplesRDFS2.explain(true) /* - rdfs3 aaa rdfs:range xxx . - yyy aaa zzz . zzz rdf:type xxx . + rdfs3 aaa rdfs:range xxx . + yyy aaa zzz . zzz rdf:type xxx . */ val rangeTriples = broadcast(index(RDFS.range.asNode()).alias("RAN")) @@ -154,8 +154,8 @@ class ForwardRuleReasonerRDFSDataframe(session: SparkSession, parallelism: Int = // 4. SubClass inheritance according to rdfs9 /* - rdfs9 xxx rdfs:subClassOf yyy . - zzz rdf:type xxx . zzz rdf:type yyy . + rdfs9 xxx rdfs:subClassOf yyy . + zzz rdf:type xxx . zzz rdf:type yyy . */ val tuplesRDFS9 = typeTuples .join(subClassOfTriplesTrans, $"TYPES.${sqlSchema.objectCol}" === $"SC.${sqlSchema.subjectCol}", "inner") @@ -289,7 +289,7 @@ object ForwardRuleReasonerRDFSDataframe { def apply(session: SparkSession, parallelism: Int = 2): ForwardRuleReasonerRDFSDataframe = new ForwardRuleReasonerRDFSDataframe(session, parallelism) def main(args: Array[String]): Unit = { - import net.sansa_stack.inference.spark.data.loader.sql.rdf._ + import net.sansa_stack.rdf.spark.io._ val parallelism = 2 @@ -320,4 +320,4 @@ object ForwardRuleReasonerRDFSDataframe { val infGraph = ForwardRuleReasonerRDFSDataframe(session).apply(graph) println(infGraph.size()) } -} \ No newline at end of file +} diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/TransitiveReasoner.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/TransitiveReasoner.scala index 5f0219e..03a4c79 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/TransitiveReasoner.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/forwardchaining/triples/TransitiveReasoner.scala @@ -78,7 +78,7 @@ class TransitiveReasoner(sc: SparkContext, val properties: Seq[Node], val parall private def addTransitive(triples: Set[Triple]): Set[Triple] = { triples ++ ( for (t1 <- triples; t2 <- triples if t1.o == t2.s) - yield Triple.create(t1.s, t1.p, t2.o)) + yield Triple.create(t1.s, t1.p, t2.o)) } /** @@ -101,7 +101,7 @@ class TransitiveReasoner(sc: SparkContext, val properties: Seq[Node], val parall /** * Computes the transitive closure for the given predicate on an RDD of triples. 
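 * A simplified sketch of the fixpoint loop used below, written over plain (subject, object) pairs
 * (names such as `pairs`, `reversed`, `delta` and `tc` are illustrative, not the actual identifiers):
 * {{{
 *   var tc = pairs                                      // (s, o) pairs of the given predicate
 *   var delta = tc                                      // pairs discovered in the last round
 *   val reversed = pairs.map { case (s, o) => (o, s) }  // keyed by object for the join
 *   while (!delta.isEmpty()) {
 *     delta = delta.join(reversed)                      // path (k, o) meets edge (s, k)
 *       .map { case (_, (o, s)) => (s, o) }             // yields the new path (s, o)
 *       .subtract(tc)
 *       .distinct()
 *     tc = tc.union(delta)
 *   }
 * }}}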
* - * @param triples the RDD of triples + * @param triples the RDD of triples * @param predicate the predicate * @return an RDD containing the transitive closure of the triples */ @@ -139,7 +139,7 @@ class TransitiveReasoner(sc: SparkContext, val properties: Seq[Node], val parall rdd.join(edgesReversed).map(x => (x._2._2, x._2._1)) } -// tc = FixpointIteration(10)(tc, f) + // tc = FixpointIteration(10)(tc, f) // the join is iterated until a fixed point is reached var i = 1 @@ -190,14 +190,14 @@ class TransitiveReasoner(sc: SparkContext, val properties: Seq[Node], val parall // the join is iterated until a fixed point is reached var i = 1 - while(!deltaTC.isEmpty()) { + while (!deltaTC.isEmpty()) { log.info(s"iteration $i...") // perform the join (x, y) x (y, x), obtaining an RDD of (x=y, (y, x)) pairs, // then project the result to obtain the new (x, y) paths. deltaTC = deltaTC.join(edgesReversed) - .map(x => (x._2._2, x._2._1)) - .subtract(tc).distinct().cache() + .map(x => (x._2._2, x._2._1)) + .subtract(tc).distinct().cache() // add to TC tc = tc.union(deltaTC).cache() @@ -217,7 +217,7 @@ class TransitiveReasoner(sc: SparkContext, val properties: Seq[Node], val parall */ def computeTransitiveClosure(edges: Dataset[Triple]): Dataset[Triple] = { log.info("computing TC...") -// implicit val myObjEncoder = org.apache.spark.sql.Encoders.kryo[RDFTriple] + // implicit val myObjEncoder = org.apache.spark.sql.Encoders.kryo[RDFTriple] val spark = edges.sparkSession.sqlContext import spark.implicits._ implicit val myObjEncoder = org.apache.spark.sql.Encoders.kryo[Triple] @@ -242,12 +242,12 @@ class TransitiveReasoner(sc: SparkContext, val properties: Seq[Node], val parall tc.createOrReplaceTempView("SC") var joined = tc.as("A").join(tc.as("B"), $"A.o" === $"B.s").select("A.s", "A.p", "B.o").as[Triple] -// var joined = tc -// .join(edges, tc("o") === edges("s")) -// .select(tc("s"), tc("p"), edges("o")) -// .as[RDFTriple] -// tc.sqlContext. -// sql("SELECT A.subject, A.predicate, B.object FROM SC A INNER JOIN SC B ON A.object = B.subject") + // var joined = tc + // .join(edges, tc("o") === edges("s")) + // .select(tc("s"), tc("p"), edges("o")) + // .as[RDFTriple] + // tc.sqlContext. 
+ // sql("SELECT A.subject, A.predicate, B.object FROM SC A INNER JOIN SC B ON A.object = B.subject") // joined.explain() // var joined = df1.join(df2, df1("object") === df2("subject"), "inner") diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/utils/PrettyDuration.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/utils/PrettyDuration.scala index ea4c1cd..66cde94 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/utils/PrettyDuration.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/utils/PrettyDuration.scala @@ -8,7 +8,7 @@ object PrettyDuration { def pretty: String = pretty(includeNanos = false) - /** Selects most apropriate TimeUnit for given duration and formats it accordingly */ + /** Selects most appropriate TimeUnit for given duration and formats it accordingly */ def pretty(includeNanos: Boolean, precision: Int = 4): String = { require(precision > 0, "precision must be > 0") @@ -48,4 +48,4 @@ object PrettyDuration { } } -} \ No newline at end of file +} diff --git a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/utils/RDFSSchemaExtractor.scala b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/utils/RDFSSchemaExtractor.scala index 8fd3335..91767cb 100644 --- a/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/utils/RDFSSchemaExtractor.scala +++ b/sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/utils/RDFSSchemaExtractor.scala @@ -35,13 +35,10 @@ class RDFSSchemaExtractor() extends Logging with Serializable { * @return the RDF graph containing only the schema triples */ def extract(graph: RDFGraph): RDFGraph = { - log.info("Started schema extraction...") val filteredTriples = graph.triples.filter(t => properties.contains(t.p)) - log.info("Finished schema extraction.") - - new RDFGraph(filteredTriples) + RDFGraph(filteredTriples) } /** @@ -51,11 +48,11 @@ class RDFSSchemaExtractor() extends Logging with Serializable { * @return the schema triples */ def extract(triples: RDD[Triple]): RDD[Triple] = { - log.info("Started schema extraction...") +// log.info("Started schema extraction...") val filteredTriples = triples.filter(t => properties.contains(t.p)) - log.info("Finished schema extraction.") +// log.info("Finished schema extraction.") filteredTriples } diff --git a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/DatastructureSerializationPerformanceTests.scala b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/DatastructureSerializationPerformanceTests.scala index cc22c96..5d969f9 100644 --- a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/DatastructureSerializationPerformanceTests.scala +++ b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/DatastructureSerializationPerformanceTests.scala @@ -1,11 +1,13 @@ package net.sansa_stack.inference.spark -import net.sansa_stack.inference.utils.{NTriplesStringToJenaTriple, NTriplesStringToRDFTriple} import org.apache.jena.graph.{Node, Triple} import org.apache.spark.SparkConf -import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerStageCompleted} +import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd} import org.apache.spark.sql.{Encoder, Encoders, Row, SparkSession} -//import org.apache.spark.groupon.metrics.{SparkMeter, SparkTimer, UserMetricsSystem} + +import 
net.sansa_stack.inference.utils.{NTriplesStringToJenaTriple, NTriplesStringToRDFTriple} +import net.sansa_stack.rdf.spark.io.NTripleReader +// import org.apache.spark.groupon.metrics.{SparkMeter, SparkTimer, UserMetricsSystem} import scala.reflect.ClassTag @@ -30,7 +32,7 @@ object DatastructureSerializationPerformanceTests { conf.registerKryoClasses(Array(classOf[org.apache.jena.graph.Triple], classOf[org.apache.jena.graph.Node])) conf.set("spark.extraListeners", "net.sansa_stack.inference.spark.utils.CustomSparkListener") - val parallelism = 4 + val parallelism = 20 class JobListener extends SparkListener { override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { @@ -40,7 +42,7 @@ object DatastructureSerializationPerformanceTests { // the SPARK config val session = SparkSession.builder - .appName(s"SPARK RDFS Reasoning") + .appName(s"RDF Triple Encoder Performance") .master("local[4]") .config("spark.eventLog.enabled", "true") .config("spark.hadoop.validateOutputSpecs", "false") // override output files @@ -74,10 +76,8 @@ object DatastructureSerializationPerformanceTests { .getOrCreate() - def loadAndDictinctJena(path: String): Unit = { - val triples = session.sparkContext - .textFile(path, 4) // read the text file - .map(new NTriplesStringToJenaTriple()) + def loadAndDistinctJena(path: String): Unit = { + val triples = NTripleReader.load(session, path) triples.cache() @@ -88,11 +88,12 @@ object DatastructureSerializationPerformanceTests { val pair = triples.map(t => (t.getSubject, (t.getPredicate, t.getObject))) // map to PairRDD val joinCount = pair.join(pair).count() - logger.info(distinctCount) - logger.info(joinCount) + logger.info("Jena RDD[Triple]") + logger.info(s"#triples:$distinctCount") + logger.info(s"#joined triples(s-s):$joinCount") } - def loadAndDictinctPlain(path: String): Unit = { + def loadAndDistinctPlain(path: String): Unit = { val triples = session.sparkContext .textFile(path, 4) // read the text file .flatMap(line => new NTriplesStringToRDFTriple().apply(line)) @@ -124,10 +125,9 @@ object DatastructureSerializationPerformanceTests { implicit def tuple3[A1, A2, A3](implicit e1: Encoder[A1], e2: Encoder[A2], e3: Encoder[A3]): Encoder[(A1, A2, A3)] = Encoders.tuple[A1, A2, A3](e1, e2, e3) - val triples = session.sparkContext - .textFile(path, 4) // read the text file - .map(new NTriplesStringToJenaTriple()) - .map(t => (t.getSubject, t.getPredicate, t.getObject)) + val triplesRDD = NTripleReader.load(session, path) + + val tripleNodesRDD = triplesRDD.map(t => (t.getSubject, t.getPredicate, t.getObject)) val conv = new NTriplesStringToJenaTriple() var tripleDS = @@ -136,29 +136,38 @@ object DatastructureSerializationPerformanceTests { // val t = conv.apply(row.getString(0)) // (t.getSubject, t.getPredicate, t.getObject) // }) - session.createDataset(triples) + session.createDataset(tripleNodesRDD) .toDF("s", "p", "o") .as[JenaTripleEncoded] + tripleDS.printSchema() tripleDS.cache() + // show 10 triples + tripleDS.show() + // DISTINCT and COUNT val distinctCount = tripleDS.distinct().count() - // self JOIN on subject and COUNT - val joinCount = tripleDS.alias("A").join(tripleDS.alias("B"), $"A.s" === $"B.s", "inner").count() - logger.info(distinctCount) - logger.info(joinCount) + // self JOIN on subject and COUNT + val triplesA = tripleDS.alias("A") + val triplesB = tripleDS.alias("B") + val triplesJoined = triplesA.joinWith(triplesB, $"A.s" === $"B.s") + val joinCount = triplesJoined.count() + + logger.info("DataFrame[(Node, Node, Node)]") + 
logger.info(s"#triples:$distinctCount") + logger.info(s"#joined triples(s-s):$joinCount") } def main(args: Array[String]): Unit = { val path = args(0) - - loadAndDictinctJena(path) - - loadAndDictinctPlain(path) +// +// loadAndDistinctJena(path) +// +// loadAndDistinctPlain(path) loadAndDistinctDatasetJena(path) diff --git a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/GenericDataframeVsGenericNativeExperiments.scala b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/GenericDataframeVsGenericNativeExperiments.scala index 21a4501..11a3894 100644 --- a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/GenericDataframeVsGenericNativeExperiments.scala +++ b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/GenericDataframeVsGenericNativeExperiments.scala @@ -25,7 +25,7 @@ object GenericDataframeVsGenericNativeExperiments { .appName("GenericDataframe-Vs-GenericNative-Experiments") .master("local[4]") .config("spark.eventLog.enabled", "true") - .config("spark.hadoop.validateOutputSpecs", "false") //override output files + .config("spark.hadoop.validateOutputSpecs", "false") // override output files .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.default.parallelism", "4") .config("spark.sql.shuffle.partitions", "8") @@ -47,7 +47,7 @@ object GenericDataframeVsGenericNativeExperiments { session = sessionBuilder.appName("generic-rdd").getOrCreate() // load triples from disk - var graph = RDFGraphLoader.loadFromDiskAsRDD(session, args(0), 4)//generateData(1) + var graph = RDFGraphLoader.loadFromDiskAsRDD(session, args(0), 4)// generateData(1) val infGraphNative = native(graph) diff --git a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/QueryLayerIntegration.scala b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/QueryLayerIntegration.scala deleted file mode 100644 index b073980..0000000 --- a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/QueryLayerIntegration.scala +++ /dev/null @@ -1,51 +0,0 @@ -package net.sansa_stack.inference.spark - -/** - * @author Lorenz Buehmann - */ - -object QueryLayerIntegration { -/* def main(args: Array[String]): Unit = { - val tempDirStr = System.getProperty("java.io.tmpdir") - if(tempDirStr == null) { - throw new RuntimeException("Could not obtain temporary directory") - } - val sparkEventsDir = new File(tempDirStr + "/spark-events") - if(!sparkEventsDir.exists()) { - sparkEventsDir.mkdirs() - } - - val sparkSession = SparkSession.builder - .master("local") - .appName("spark session example") - .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - //.config("spark.kryo.registrationRequired", "true") - .config("spark.eventLog.enabled", "true") - .config("spark.kryo.registrator", String.join(", ", - "net.sansa_stack.rdf.spark.io.JenaKryoRegistrator", - "net.sansa_stack.query.spark.sparqlify.KryoRegistratorSparqlify" - )) - .config("spark.default.parallelism", "4") - .config("spark.sql.shuffle.partitions", "4") - .getOrCreate() - - val triplesString = - """ "Guy De" . - | . - | . - | . - | "Charles"@en . 
- | .""".stripMargin - - val it = RDFDataMgr.createIteratorTriples(IOUtils.toInputStream(triplesString, "UTF-8"), Lang.NTRIPLES, "http://example.org/").asScala.toSeq - //it.foreach { x => println("GOT: " + (if(x.getObject.isLiteral) x.getObject.getLiteralLanguage else "-")) } - val graphRdd = sparkSession.sparkContext.parallelize(it) - - //val map = graphRdd.partitionGraphByPredicates - val partitions = RdfPartitionUtilsSpark.partitionGraph(graphRdd, RdfPartitionerDefault) - - partitions.foreach(p => println(p._1)) - } - -*/ -} diff --git a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/loader/RDFLoadingTests.scala b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/loader/RDFLoadingTests.scala index 7c18f8b..9a13a2a 100644 --- a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/loader/RDFLoadingTests.scala +++ b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/loader/RDFLoadingTests.scala @@ -5,13 +5,13 @@ import org.apache.jena.riot.Lang import org.scalatest.FunSuite /** - * Tests for loading triples from either N-Triples are Turtle files into a DataFrame. + * Tests for loading triples from either N-Triples or Turtle files into a DataFrame. * * @author Lorenz Buehmann */ class RDFLoadingTests extends FunSuite with DataFrameSuiteBase { - import net.sansa_stack.inference.spark.data.loader.sql.rdf._ + import net.sansa_stack.rdf.spark.io._ test("loading N-Triples file into DataFrame with REGEX parsing mode should result in 9 triples") { val sqlCtx = sqlContext diff --git a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/BroadcastVsRddRuleProcessingExperiments.scala b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/BroadcastVsRddRuleProcessingExperiments.scala index e3e13f4..c59011b 100644 --- a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/BroadcastVsRddRuleProcessingExperiments.scala +++ b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/BroadcastVsRddRuleProcessingExperiments.scala @@ -85,8 +85,8 @@ object BroadcastVsRddRuleProcessingExperiments extends Profiler{ } /* - rdfs7 aaa rdfs:subPropertyOf bbb . - xxx aaa yyy . xxx bbb yyy . + rdfs7 aaa rdfs:subPropertyOf bbb . + xxx aaa yyy . xxx bbb yyy . */ def rddOnly(triples: RDD[Triple]): RDD[Triple] = { diff --git a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/RDFGraphMaterializerTest.scala b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/RDFGraphMaterializerTest.scala index 223df1e..01bb5b1 100644 --- a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/RDFGraphMaterializerTest.scala +++ b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/RDFGraphMaterializerTest.scala @@ -1,13 +1,14 @@ package net.sansa_stack.inference.spark.rules +import scala.collection.mutable + +import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.ModelFactory import org.apache.spark.{SparkConf, SparkContext} -import scala.collection.mutable import net.sansa_stack.inference.spark.data.model.RDFGraph import net.sansa_stack.inference.spark.data.writer.RDFGraphWriter import net.sansa_stack.inference.spark.forwardchaining.triples.ForwardRuleReasonerRDFS -import org.apache.jena.graph.{Node, NodeFactory, Triple} /** * The class to compute the materialization of a given RDF graph. 
diff --git a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/SetOfRulesTest.scala b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/SetOfRulesTest.scala index b25d3cc..26c7bc0 100644 --- a/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/SetOfRulesTest.scala +++ b/sansa-inference-spark/src/test/scala/net/sansa_stack/inference/spark/rules/SetOfRulesTest.scala @@ -1,17 +1,18 @@ package net.sansa_stack.inference.spark.rules +import scala.collection.mutable + +import org.apache.jena.graph.Triple +import org.apache.jena.reasoner.rulesys.Rule +import org.apache.jena.vocabulary.{OWL2, RDF, RDFS} +import org.apache.spark.sql.SparkSession + import net.sansa_stack.inference.data.JenaOps import net.sansa_stack.inference.spark.data.loader.RDFGraphLoader import net.sansa_stack.inference.spark.data.model.RDFGraphNative import net.sansa_stack.inference.spark.data.writer.RDFGraphWriter import net.sansa_stack.inference.spark.forwardchaining.triples.{ForwardRuleReasonerNaive, ForwardRuleReasonerOptimizedNative} import net.sansa_stack.inference.utils.RuleUtils -import org.apache.jena.graph.Triple -import org.apache.jena.reasoner.rulesys.Rule -import org.apache.jena.vocabulary.{OWL2, RDF, RDFS} -import org.apache.spark.sql.SparkSession - -import scala.collection.mutable /** * A forward chaining implementation of the RDFS entailment regime. @@ -25,7 +26,7 @@ object SetOfRulesTest { // .master("spark://me-ThinkPad-W510:7077") .master("local[4]") .config("spark.eventLog.enabled", "true") - .config("spark.hadoop.validateOutputSpecs", "false") //override output files + .config("spark.hadoop.validateOutputSpecs", "false") // override output files .config("spark.default.parallelism", "4") .config("spark.sql.shuffle.partitions", "8") // .config("spark.jars", "/home/me/work/projects/scala/Spark-Sem-I/target/inference-spark-0.1-SNAPSHOT.jar") @@ -48,7 +49,7 @@ object SetOfRulesTest { val numberOfTriples = graph.size() println("#Triples:" + numberOfTriples) - val rules = RuleUtils.load("rdfs-simple.rules")//.filter(r => ruleNames.contains(r.getName)) + val rules = RuleUtils.load("rdfs-simple.rules")// .filter(r => ruleNames.contains(r.getName)) // runNaive(graph, rules) // runNative(graph, rules) diff --git a/sansa-inference-tests/pom.xml b/sansa-inference-tests/pom.xml index 9e746da..795ada6 100644 --- a/sansa-inference-tests/pom.xml +++ b/sansa-inference-tests/pom.xml @@ -4,12 +4,12 @@ sansa-inference-parent_2.11 net.sansa-stack - 0.3.0 + 0.4.0 ../pom.xml net.sansa-stack sansa-inference-tests_${scala.binary.version} - 0.3.0 + 0.4.0 Inference API - Tests Contains common data and utils for inference API testing @@ -54,6 +54,11 @@ org.scalatest scalatest_${scala.binary.version} + + + com.typesafe.scala-logging + scala-logging_${scala.binary.version} + diff --git a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala index c770b73..7c37c7a 100644 --- a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala +++ b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/ConformanceTestBase.scala @@ -1,17 +1,18 @@ package net.sansa_stack.test.conformance -import java.io.File +import java.io.{File, StringWriter} +import java.nio.file.{Path, Paths} +import net.sansa_stack.inference.data.{RDF, RDFOps} import org.apache.jena.rdf.model.Model -import 
org.junit.runner.RunWith -import net.sansa_stack.inference.data.{RDF, RDFOps, RDFTriple} import org.apache.jena.shared.PrefixMapping -import org.apache.jena.sparql.util.{FmtUtils, PrefixMapping2} +import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfterAll, FlatSpec} - import scala.collection.mutable +import net.sansa_stack.test.conformance.TestCases.getClass + /** * The class is to test the conformance of each materialization rule of RDFS(simple) entailment. * @@ -21,39 +22,46 @@ import scala.collection.mutable @RunWith(classOf[JUnitRunner]) abstract class ConformanceTestBase[Rdf <: RDF](val rdfOps: RDFOps[Rdf]) extends FlatSpec with BeforeAndAfterAll { + val logger = com.typesafe.scalalogging.Logger("ConformanceTestBase") + behavior of "" // the test case IDs def testCaseIds: Set[String] - // the base directory of the test cases - def testsCasesFolder: File + def testsCasesFolder: String = testCasesPath // this.getClass.getClassLoader.getResource(testCasesPath).getPath +// def testsCasesFolder: File = null // new File(this.getClass.getClassLoader.getResource(testCasesPath).getPath) + + def testCasesPath: String - val pm = PrefixMapping.Factory.create() + private val pm = PrefixMapping.Factory.create() .setNsPrefix("ex", "http://www.example.org#") .setNsPrefix("", "http://www.example.org#") .withDefaultMappings(PrefixMapping.Standard) // load the test cases - val testCases = TestCases.loadTestCases(testsCasesFolder).filter(t => testCaseIds.contains(t.id)) - - testCases.foreach{testCase => - println(testCase.id) + lazy val testCases = TestCases.loadTestCasesJar(testsCasesFolder, testCaseIds) + // scalastyle:off println + testCases.foreach { testCase => testCase.id should "produce the same graph" in { val triples = new mutable.HashSet[Rdf#Triple]() // convert to internal triples val iterator = testCase.inputGraph.listStatements() - while(iterator.hasNext) { + while (iterator.hasNext) { val st = iterator.next() triples.add( rdfOps.makeTriple( rdfOps.makeUri(st.getSubject.toString), rdfOps.makeUri(st.getPredicate.toString), - if(st.getObject.isLiteral) - rdfOps.makeLiteral(st.getObject.asLiteral().getLexicalForm, rdfOps.makeUri(st.getObject.asLiteral().getDatatypeURI)) - else rdfOps.makeUri(st.getObject.toString))) + if (st.getObject.isLiteral) { + rdfOps.makeLiteral(st.getObject.asLiteral().getLexicalForm, rdfOps.makeUri(st.getObject.asLiteral().getDatatypeURI)) + } else { + rdfOps.makeUri(st.getObject.toString) + } + ) + ) } // compute inferred graph @@ -63,14 +71,20 @@ abstract class ConformanceTestBase[Rdf <: RDF](val rdfOps: RDFOps[Rdf]) extends // remove the input triples such that we can compare only the conclusion graph inferredModel.remove(testCase.inputGraph) - println("#" * 80 + "\ninput:") - testCase.inputGraph.write(System.out, "TURTLE") + logger.whenDebugEnabled { + println("#" * 80 + "\ninput:") + testCase.inputGraph.write(System.out, "TURTLE") + } - println("#" * 80 + "\nexpected output:") - testCase.outputGraph.write(System.out, "TURTLE") + logger.whenDebugEnabled { + println("#" * 80 + "\nexpected output:") + testCase.outputGraph.write(System.out, "TURTLE") + } - println("#" * 80 + "\ngot output:") - inferredModel.write(System.out, "TURTLE") + logger.whenDebugEnabled { + println("#" * 80 + "\ngot output:") + inferredModel.write(System.out, "TURTLE") + } // compare models, i.e. 
the inferred model should contain exactly the triples of the conclusion graph val correctOutput = inferredModel.containsAll(testCase.outputGraph) @@ -82,5 +96,4 @@ abstract class ConformanceTestBase[Rdf <: RDF](val rdfOps: RDFOps[Rdf]) extends } def computeInferredModel(triples: mutable.HashSet[Rdf#Triple]): Model - } diff --git a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/OWLHorstConformanceTestBase.scala b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/OWLHorstConformanceTestBase.scala index dd05908..6e7cc48 100644 --- a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/OWLHorstConformanceTestBase.scala +++ b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/OWLHorstConformanceTestBase.scala @@ -1,8 +1,6 @@ package net.sansa_stack.test.conformance -import java.io.File - -import net.sansa_stack.inference.data.{JenaOps, RDF, RDFOps} +import net.sansa_stack.inference.data.{RDF, RDFOps} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @@ -18,6 +16,8 @@ abstract class OWLHorstConformanceTestBase[Rdf <: RDF](override val rdfOps: RDFO behavior of "conformance of OWL Horst entailment rules" + override def testCasesPath: String = "data/conformance/owl2rl" + override def testCaseIds: Set[String] = Set( "rdfbased-sem-rdfs-domain-cond", "rdfbased-sem-rdfs-range-cond", @@ -40,6 +40,4 @@ abstract class OWLHorstConformanceTestBase[Rdf <: RDF](override val rdfOps: RDFO "rdfbased-sem-restrict-somevalues-inst-subj", "rdfbased-sem-restrict-allvalues-inst-obj" ) - - override def testsCasesFolder: File = new File(this.getClass.getClassLoader.getResource("data/conformance/owl2rl").getPath) } diff --git a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/RDFSConformanceTestBase.scala b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/RDFSConformanceTestBase.scala index 0c79d6b..df2a03e 100644 --- a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/RDFSConformanceTestBase.scala +++ b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/RDFSConformanceTestBase.scala @@ -1,8 +1,6 @@ package net.sansa_stack.test.conformance -import java.io.File - -import net.sansa_stack.inference.data.{Jena, RDF, RDFOps} +import net.sansa_stack.inference.data.{RDF, RDFOps} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @@ -18,6 +16,8 @@ abstract class RDFSConformanceTestBase[Rdf <: RDF](override val rdfOps: RDFOps[R behavior of "conformance of RDFS(simple) entailment rules" + override def testCasesPath: String = "data/conformance/rdfs" + override def testCaseIds: Set[String] = Set( "rdfbased-sem-rdfs-domain-cond", "rdfbased-sem-rdfs-range-cond", @@ -25,6 +25,4 @@ abstract class RDFSConformanceTestBase[Rdf <: RDF](override val rdfOps: RDFOps[R "rdfbased-sem-rdfs-subclass-trans", "rdfbased-sem-rdfs-subprop-cond", "rdfbased-sem-rdfs-subprop-trans") - - override def testsCasesFolder: File = new File(this.getClass.getClassLoader.getResource("data/conformance/rdfs").getPath) } diff --git a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/TestCase.scala b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/TestCase.scala index 449787d..3993976 100644 --- a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/TestCase.scala +++ b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/TestCase.scala @@ -7,6 +7,4 @@ import org.apache.jena.rdf.model.Model * * 
@author Lorenz Buehmann */ -case class TestCase (id: String, description: String, testCaseType: String, inputGraph: Model, outputGraph: Model){ - -} +case class TestCase(id: String, description: String, testCaseType: String, inputGraph: Model, outputGraph: Model) {} diff --git a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/TestCases.scala b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/TestCases.scala index a0705f8..a1b9df5 100644 --- a/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/TestCases.scala +++ b/sansa-inference-tests/src/test/scala/net/sansa_stack/test/conformance/TestCases.scala @@ -1,12 +1,16 @@ package net.sansa_stack.test.conformance import java.io.File +import java.nio.file.{FileSystem, Path} -import org.apache.jena.riot.RDFDataMgr - +import org.apache.jena.riot.{Lang, RDFDataMgr} import scala.collection.mutable.ListBuffer import scala.xml.XML +import org.apache.commons.io.IOUtils +import org.apache.jena.rdf.model.ModelFactory +import org.scalatest.path + /** * Test cases loader. * @@ -14,6 +18,10 @@ import scala.xml.XML */ object TestCases { + val logger = com.typesafe.scalalogging.Logger("TestCases") + + var fs: FileSystem = null + /** * Loads test cases from the given root folder. * @@ -21,9 +29,12 @@ object TestCases { * @return test cases */ def loadTestCases(directory: File, ids: Set[String] = Set.empty): Seq[TestCase] = { + println(s"loading test cases from ${directory.getAbsolutePath}...") val testCases = new ListBuffer[TestCase]() + println(directory) + directory.listFiles().filter(f => f.isDirectory && (ids.isEmpty || ids.contains(f.getName))).foreach { subDirectory => // the files in the directory @@ -45,7 +56,7 @@ object TestCases { val entailmentType = (metadata \\ "entry").filter(n => n.attribute("key").get.text == "testcase.type").text // currently we support only entailment test cases - if(entailmentType == "POSITIVE_ENTAILMENT") { + if (entailmentType == "POSITIVE_ENTAILMENT") { // load input data val inputGraph = RDFDataMgr.loadModel(files.filter(_.getName.endsWith(".premisegraph.ttl")).head.getPath) @@ -54,8 +65,94 @@ object TestCases { testCases += TestCase(id, description, entailmentType, inputGraph, outputGraph) } + } + println(s"loaded ${testCases.size} test cases") testCases } + + /** + * Loads test cases from the given root folder. 
+ * + * @param directory the root folder containing sub-folders for each test case + * @return test cases + */ + def loadTestCasesJar(directory: String, ids: Set[String] = Set.empty): Seq[TestCase] = { + logger.info(s"loading test cases from ${directory}...") + + val testCases = new ListBuffer[TestCase]() + + listFiles(directory).filter(f => ids.isEmpty || ids.contains(f.getFileName.toString.replace("/", ""))).map { p => + + // the files in the directory + val files = listFiles( + if (p.toUri.getScheme == "jar") p.toString.substring(1) else p.toString, true) + + // get the metadata file + val metadataFile = files.filter(_.toString.endsWith(".metadata.properties")).head + + // load metadata XML + val metadata = XML.load(metadataFile.toUri.toURL.openStream()) + + // id + val id = (metadata \\ "entry").filter(n => n.attribute("key").get.text == "testcase.id").text + + // description + val description = (metadata \\ "entry").filter(n => n.attribute("key").get.text == "testcase.description").text + + // test case type + val entailmentType = (metadata \\ "entry").filter(n => n.attribute("key").get.text == "testcase.type").text + + // currently we support only entailment test cases + if (entailmentType == "POSITIVE_ENTAILMENT") { + // load input data + + val inputGraph = ModelFactory.createDefaultModel() + inputGraph.read(files.filter(_.toString.endsWith(".premisegraph.ttl")).head.toUri.toURL.openStream(), null, "Turtle") + + // load output data + val outputGraph = ModelFactory.createDefaultModel() + outputGraph.read(files.filter(_.toString.endsWith(".conclusiongraph.ttl")).head.toUri.toURL.openStream(), null, "Turtle") + + testCases += TestCase(id, description, entailmentType, inputGraph, outputGraph) + } + } +// directory.listFiles().filter(f => f.isDirectory && (ids.isEmpty || ids.contains(f.getName))).foreach { subDirectory => + + + println(s"loaded ${testCases.size} test cases") + + if(fs != null) fs.close() + + testCases + } + + private def listFiles(path: String, subDir: Boolean = false): Seq[Path] = { + import java.nio.file.FileSystems + import java.nio.file.Files + import java.nio.file.Paths + import java.util.Collections + +// println(s"path: $path") + val uri = if (path.startsWith("/")) new File(path).toURI else classOf[TestCase].getClassLoader.getResource(path).toURI +// println(s"uri: $uri") + var myPath: Path = null + if (uri.getScheme == "jar" && !subDir) { + fs = FileSystems.newFileSystem(uri, Collections.emptyMap[String, Any]) + myPath = fs.getPath(path) + } + else myPath = Paths.get(uri) + val walk = Files.walk(myPath, 1) + val it = walk.iterator + var files = Seq[Path]() + while ({it.hasNext}) { + val subPath = it.next() + if(!subPath.equals(myPath)) { + files :+= subPath + } + } + + files + } } diff --git a/scalastyle-config.xml b/scalastyle-config.xml index 67ad459..f9218e2 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -116,7 +116,7 @@ This file is divided into 3 sections: - + @@ -142,13 +142,13 @@ This file is divided into 3 sections: - - ^println$ - - + + + + + + + @VisibleForTesting @@ -222,15 +222,15 @@ This file is divided into 3 sections: is slower. - - - java,scala,3rdParty,sansa - javax?\..* - scalax?\..* - (?!net\.sansa_stack\.inference\.).* - net\.sansa_stack\.inference\..* - - + + + + + + + + +