Skip to content

Commit

Permalink
Merge pull request #185 from justin2004/tdb2-exploration
Browse files Browse the repository at this point in the history
option to use an on disk graph (TDB2)
  • Loading branch information
enridaga authored Dec 22, 2021
2 parents bc785fe + 68a3ddb commit 83d9156
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 8 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ sparql.anything.experiment/src/main/java/com/github/spiceh2020/sparql/anything/e
/log/
*swp
*swo
*/tmp/
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@ WHERE {
|triplifier|It forces sparql.anything to use a specific triplifier for transforming the data source|A canonical name of a Java class|No value|
|charset|The charset of the data source.|Any charset.|UTF-8|
|metadata|It tells sparql.anything to extract metadata from the data source and to store it in the named graph with URI <http://sparql.xyz/facade-x/data/metadata> |true/false|false|
|ondisk|It tells sparql.anything to use an on disk graph (instead of the default in memory graph). The string should be a path to a directory where the on disk graph will be stored. Using an on disk graph is almost always slower (than using the default in memory graph) but with it you can triplify large files without running out of memory.|a path to a directory|not set|
|ondisk.reuse|When using an on disk graph, it tells sparql.anything to reuse the previous on disk graph instead of creating a fresh one. Only meaningful together with the ondisk option; any value is treated as true.|true|not set|
|strategy|The execution strategy. 0 = in memory, all triples; 1 = in memory, only triples matching any of the triple patterns in the where clause|0,1|1|

\* It is mandatory to provide either the local or the content.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ public DatasetGraph triplify(Properties properties, FacadeXGraphBuilder builder)
log.trace(" > is data {}", rown);
// Rows
rown++;
if((rown % 10000)==0){
log.debug("current row num: {}", rown);
}
String rowContainerId = containerRowPrefix + rown;
builder.addContainer(dataSourceId, root, rown, rowContainerId);
CSVRecord record = recordIterator.next();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,15 @@
import org.apache.jena.sparql.graph.GraphFactory;
import org.junit.Test;
import static org.junit.Assert.fail;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.jena.query.TxnType;

import com.github.sparqlanything.model.IRIArgument;

public class CSVTriplifierTest {
private CSVTriplifier triplifier = new CSVTriplifier();
public static final Logger log = LoggerFactory.getLogger(CSVTriplifierTest.class);

@Test
public void testCsvNullStrings() throws IOException, TriplifierHTTPException {
Expand Down Expand Up @@ -115,4 +119,55 @@ public void testTab() throws IOException, TriplifierHTTPException {
System.err.println(t);
}
}

/**
 * Triplifies test3.csv into an on-disk (TDB2) graph and checks that
 * exactly 21 quads are produced.
 */
@Test
public void testWithOnDiskGraph1() throws IOException, TriplifierHTTPException {
	Properties properties = new Properties();
	properties.setProperty("namespace", "http://www.example.org#");
	// directory under which the temporary TDB2 directory is created
	properties.setProperty("ondisk", "/tmp");
	URL csv1 = getClass().getClassLoader().getResource("./test3.csv");
	properties.setProperty(IRIArgument.LOCATION.toString(), csv1.toString());
	DatasetGraph graph = triplifier.triplify(properties, new BaseFacadeXBuilder(csv1.toString(), properties));

	// end the write txn because triplifiers don't do that, FacadeXOpExecutor does
	graph.commit();
	graph.end();

	graph.begin(TxnType.READ);
	int count = 0; // primitive int: the old Integer counter boxed on every increment
	try {
		Iterator<Quad> iter = graph.find(null, null, null, null);
		while (iter.hasNext()) {
			iter.next(); // only the count matters; the quad itself was an unused local
			count++;
		}
	} finally {
		// end the read txn even if counting (or the check below) fails;
		// the original leaked the txn when fail() threw before graph.end()
		graph.end();
	}
	if (count != 21) {
		fail("expected 21 quads but found " + count);
	}
}

/**
 * Triplifies test1.csv into an on-disk (TDB2) graph and checks that
 * exactly 13 quads are produced.
 */
@Test
public void testWithOnDiskGraph2() throws IOException, TriplifierHTTPException {
	Properties properties = new Properties();
	properties.setProperty("namespace", "http://www.example.org#");
	// directory under which the temporary TDB2 directory is created
	properties.setProperty("ondisk", "/tmp");
	URL csv1 = getClass().getClassLoader().getResource("./test1.csv");
	properties.setProperty(IRIArgument.LOCATION.toString(), csv1.toString());
	DatasetGraph graph = triplifier.triplify(properties, new BaseFacadeXBuilder(csv1.toString(), properties));

	// end the write txn because triplifiers don't do that, FacadeXOpExecutor does
	graph.commit();
	graph.end();

	graph.begin(TxnType.READ);
	int count = 0; // primitive int: the old Integer counter boxed on every increment
	try {
		Iterator<Quad> iter = graph.find(null, null, null, null);
		while (iter.hasNext()) {
			iter.next(); // only the count matters; the quad itself was an unused local
			count++;
		}
	} finally {
		// end the read txn even if counting (or the check below) fails;
		// the original leaked the txn when fail() threw before graph.end()
		graph.end();
	}
	if (count != 13) {
		fail("expected 13 quads but found " + count);
	}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import org.apache.jena.graph.Triple;
import org.apache.jena.query.ARQ;
import org.apache.jena.query.DatasetFactory;
import org.apache.jena.tdb2.TDB2Factory;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Resource;
Expand Down Expand Up @@ -68,6 +69,7 @@
import org.apache.jena.sparql.util.Symbol;
import org.apache.jena.vocabulary.RDF;
import org.apache.jena.vocabulary.VOID;
import org.apache.jena.query.TxnType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -159,6 +161,11 @@ private DatasetGraph getDatasetGraph(Properties p, Op op) throws IOException, In

logger.trace("Triplifier {}\n{}", t.getClass().toString(), op.toString());
dg = triplify(op, p, t);

logger.debug("triplification done -- commiting and ending the write txn");
dg.commit();
dg.end();

if (urlLocation != null) {
logger.trace("Location provided {}", urlLocation);
URL url = Triplifier.instantiateURL(urlLocation);
Expand All @@ -170,6 +177,7 @@ private DatasetGraph getDatasetGraph(Properties p, Op op) throws IOException, In
executedFacadeXIris.put(getInMemoryCacheKey(p, op), dg);
logger.debug("Graph added to in-memory cache");
}
// TODO wrap this in a txn or move it to a place where we are already in a txn
// logger.trace("Triplified, #triples in default graph {} {}", dg.getDefaultGraph().size(), op.toString());

// else {
Expand Down Expand Up @@ -326,7 +334,9 @@ private DatasetGraph triplify(final Op op, Properties p, Triplifier t) throws IO
logger.error("No triplifier available for the input format!");
dg = DatasetFactory.create().asDatasetGraph();
}
//logger.trace("Default graph size {}", dg.getDefaultGraph().size());

// logger.trace("Union graph size {}",dg.getUnionGraph().size());
// logger.trace("Default graph size {}", dg.getDefaultGraph().size());
return dg;
}

Expand Down Expand Up @@ -531,10 +541,19 @@ protected QueryIterator execute(final OpPath s, QueryIterator input) {
}

protected QueryIterator execute(final OpBGP opBGP, QueryIterator input) {

if(this.execCxt.getClass()!=FacadeXExecutionContext.class) {
return super.execute(opBGP, input);
}

// i think we can consider this the start of the query execution and therefore the read txn.
// we won't end this read txn until the next query takes execution back through BaseFacadeXBuilder
if(!this.execCxt.getDataset().isInTransaction()){
// i think we need the test (instead of just unconditionally starting the txn) because if we postpone
// during a query execution, execution could pass through here again
logger.debug("begin read txn");
this.execCxt.getDataset().begin(TxnType.READ);
}

logger.trace("executing BGP {}", opBGP.toString());
logger.trace("Size: {} {}", this.execCxt.getDataset().size(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,34 +22,98 @@
import org.apache.jena.graph.Triple;
import org.apache.jena.rdf.model.*;
import org.apache.jena.sparql.core.DatasetGraph;
import org.apache.jena.tdb2.TDB2Factory;
import org.apache.jena.sparql.core.DatasetGraphFactory;
import org.apache.jena.query.Dataset;
import org.apache.jena.vocabulary.RDF;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.io.FileUtils;
import org.apache.jena.query.TxnType;

import java.net.URI;
import java.io.File;
import java.nio.file.Paths;
import java.nio.file.Files;
import java.util.Properties;

public class BaseFacadeXBuilder implements FacadeXGraphBuilder {
private static final String PROPERTY_ONDISK_REUSE = "ondisk.reuse";
private static final String PROPERTY_ONDISK = "ondisk";
protected static final Logger log = LoggerFactory.getLogger(TripleFilteringFacadeXBuilder.class);
protected final Properties properties;
protected final Node mainGraphName;

// when using a TDB2 this is defined
protected static Dataset dataset = null; // TODO making this static is a kludge maybe
protected final DatasetGraph datasetGraph;
//
protected final boolean p_blank_nodes;
protected final String p_namespace;
protected final String p_root;
protected final boolean p_trim_strings;
protected final String p_null_string;
protected static String previousTDB2Path = "";

/**
 * Creates a builder whose DatasetGraph is chosen by getDatasetGraph(properties):
 * an on-disk TDB2 dataset when the "ondisk" property is set, an in-memory
 * DatasetGraph otherwise.
 */
public BaseFacadeXBuilder(String resourceId, Properties properties) {
	// Pass null for the DatasetGraph: the protected constructor ignores it
	// and asks getDatasetGraph(properties) for the graph to populate.
	// (The superseded DatasetGraphFactory.create() delegation retained by the
	// diff scrape is dropped — a constructor can delegate only once.)
	this(resourceId, null, properties);
}

// this is where all graph (graphs that we actually put triples in) creation happens
/**
 * Returns the DatasetGraph that triples will be written into: a TDB2 on-disk
 * dataset when the "ondisk" property is set, otherwise a fresh in-memory
 * DatasetGraph.
 *
 * With "ondisk.reuse" also set, the previous TDB2 directory is reused;
 * otherwise any previous directory is deleted and a new temporary directory
 * is created under the configured path (falling back to the system temp
 * directory when that path is not a directory).
 */
public static DatasetGraph getDatasetGraph(Properties properties) {
	DatasetGraph dsg;
	String TDB2Path = "";
	boolean ONDISK = properties.containsKey(PROPERTY_ONDISK);
	boolean ONDISK_REUSE = properties.containsKey(PROPERTY_ONDISK_REUSE); // TODO any string counts as "true"

	if (ONDISK) {
		// compare String contents with isEmpty()/equals(), never with == / !=
		if (!BaseFacadeXBuilder.previousTDB2Path.isEmpty() && ONDISK_REUSE) {
			TDB2Path = previousTDB2Path;
		} else {
			try {
				if (!previousTDB2Path.isEmpty()) {
					log.debug("deleting previous TDB2 at: {}", previousTDB2Path);
					FileUtils.deleteDirectory(new File(previousTDB2Path));
				}
				if (Files.isDirectory(Paths.get(properties.getProperty(PROPERTY_ONDISK)))) {
					TDB2Path = Files.createTempDirectory(Paths.get(properties.getProperty(PROPERTY_ONDISK)), "").toString();
				} else {
					// fall back to the platform temp dir rather than a hard-coded /tmp
					log.debug("the specified path is not a directory: {}\nusing the system temp directory instead",
							properties.getProperty(PROPERTY_ONDISK));
					TDB2Path = Files.createTempDirectory(Paths.get(System.getProperty("java.io.tmpdir")), "").toString();
				}
				// store the TDB2Path for next time (in case we want to reuse it or delete it)
				BaseFacadeXBuilder.previousTDB2Path = TDB2Path;
			} catch (Exception ex) {
				// log the full stack trace instead of just ex.toString()
				log.error("failed to prepare the TDB2 directory", ex);
			}
		}
		log.debug("using on disk TDB2 at: {}", TDB2Path);
		dataset = TDB2Factory.connectDataset(TDB2Path);
		dsg = dataset.asDatasetGraph();
		if (dsg.isInTransaction()) {
			// if we are reusing the same TDB2 then this will be true so
			// end the read txn from the previous query
			dsg.end();
		}
	} else {
		log.debug("using in memory DatasetGraph");
		// an in-memory DatasetGraph is never reused across queries,
		// so there is no previous read txn to end
		dsg = DatasetGraphFactory.create();
	}
	return dsg;
}

protected BaseFacadeXBuilder(String resourceId, DatasetGraph ds, Properties properties) {
this.properties = properties;
this.mainGraphName = NodeFactory.createURI(resourceId);
this.datasetGraph = ds;
this.datasetGraph = BaseFacadeXBuilder.getDatasetGraph(properties);

// the single place to begin write txns
log.debug("begin write txn");
this.datasetGraph.begin(TxnType.WRITE);

this.p_blank_nodes = Triplifier.getBlankNodeArgument(properties);
this.p_trim_strings = Triplifier.getTrimStringsArgument(properties);
this.p_null_string = Triplifier.getNullStringArgument(properties);
Expand All @@ -73,7 +137,6 @@ public boolean add(Node graph, Node subject, Node predicate, Node object) {
if(p_null_string != null && object.isLiteral() && object.getLiteral().toString().equals(p_null_string)){
return false;
}

Triple t = new Triple(subject, predicate, object);
if (datasetGraph.getGraph(graph).contains(t)) {
return false;
Expand Down Expand Up @@ -175,7 +238,13 @@ public Model getModel() {

/**
 * Returns the populated DatasetGraph. For the in-memory dataset only, the
 * default graph is set to the union of all named graphs so that plain
 * {?s ?p ?o} patterns match the triplified data.
 */
@Override
public DatasetGraph getDatasetGraph() {
	// The unconditional setDefaultGraph(unionGraph) line the commit removed
	// must not run for TDB2: on-disk datasets do not support it.
	if (dataset == null) {
		// we have an in memory DatasetGraph
		datasetGraph.setDefaultGraph(datasetGraph.getUnionGraph());
		// we are unable to do that ^ with an on disk DatasetGraph (TDB2)
		// so that means you need to do `graph ?g {?s ?p ?o}` instead of simply
		// `{?s ?p ?o}` in a query when you use a TDB2
	}
	return datasetGraph;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,9 @@ public TripleFilteringFacadeXBuilder(String resourceId, Op op, DatasetGraph ds,
}

/**
 * Creates a filtering builder; DatasetGraph creation is delegated entirely
 * to BaseFacadeXBuilder (which honors the "ondisk" property).
 */
public TripleFilteringFacadeXBuilder(String resourceId, Op op, Properties properties) {
	// don't make a DatasetGraph here
	// instead let BaseFacadeXBuilder do all the DatasetGraph making
	// (the superseded DatasetGraphFactory.create() delegation retained by the
	// diff scrape is dropped — a constructor can delegate only once)
	this(resourceId, op, null, properties);
}
//
// public TripleFilteringFacadeXBuilder(URL location, Op op, Properties properties) {
Expand Down

0 comments on commit 83d9156

Please sign in to comment.