Skip to content

Commit

Permalink
Merge pull request #185 from justin2004/tdb2-exploration
Browse files Browse the repository at this point in the history
option to use an on disk graph (TDB2)
  • Loading branch information
enridaga authored Dec 22, 2021
2 parents bc785fe + 68a3ddb commit 83d9156
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 8 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ sparql.anything.experiment/src/main/java/com/github/spiceh2020/sparql/anything/e
/log/
*swp
*swo
*/tmp/
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@ WHERE {
|triplifier|It forces sparql.anything to use a specific triplifier for transforming the data source|A canonical name of a Java class|No value|
|charset|The charset of the data source.|Any charset.|UTF-8|
|metadata|It tells sparql.anything to extract metadata from the data source and to store it in the named graph with URI <http://sparql.xyz/facade-x/data/metadata> |true/false|false|
|ondisk|It tells sparql.anything to use an on disk graph (instead of the default in memory graph). The string should be a path to a directory where the on disk graph will be stored. Using an on disk graph is almost always slower (than using the default in memory graph) but with it you can triplify large files without running out of memory.|a path to a directory|not set|
|ondisk.reuse|When using an on disk graph, it tells sparql.anything to reuse the previous on disk graph instead of creating a fresh one. Only meaningful together with the ondisk option; any value is treated as true.|true|not set|
|strategy|The execution strategy. 0 = in memory, all triples; 1 = in memory, only triples matching any of the triple patterns in the where clause|0,1|1|

\* It is mandatory to provide either the local or the content.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ public DatasetGraph triplify(Properties properties, FacadeXGraphBuilder builder)
log.trace(" > is data {}", rown);
// Rows
rown++;
if((rown % 10000)==0){
log.debug("current row num: {}", rown);
}
String rowContainerId = containerRowPrefix + rown;
builder.addContainer(dataSourceId, root, rown, rowContainerId);
CSVRecord record = recordIterator.next();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,15 @@
import org.apache.jena.sparql.graph.GraphFactory;
import org.junit.Test;
import static org.junit.Assert.fail;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.jena.query.TxnType;

import com.github.sparqlanything.model.IRIArgument;

public class CSVTriplifierTest {
private CSVTriplifier triplifier = new CSVTriplifier();
public static final Logger log = LoggerFactory.getLogger(CSVTriplifierTest.class);

@Test
public void testCsvNullStrings() throws IOException, TriplifierHTTPException {
Expand Down Expand Up @@ -115,4 +119,55 @@ public void testTab() throws IOException, TriplifierHTTPException {
System.err.println(t);
}
}

/**
 * Triplifies test3.csv into an on-disk (TDB2) graph and checks that
 * exactly 21 quads are produced.
 */
@Test
public void testWithOnDiskGraph1() throws IOException, TriplifierHTTPException {
	Properties properties = new Properties();
	properties.setProperty("namespace", "http://www.example.org#");
	// directory under which the temporary TDB2 directory is created
	properties.setProperty("ondisk", "/tmp");
	URL csv1 = getClass().getClassLoader().getResource("./test3.csv");
	properties.setProperty(IRIArgument.LOCATION.toString(), csv1.toString());
	DatasetGraph graph = triplifier.triplify(properties, new BaseFacadeXBuilder(csv1.toString(), properties));

	// end the write txn because triplifiers don't do that, FacadeXOpExecutor does
	graph.commit();
	graph.end();

	graph.begin(TxnType.READ);
	int count = 0; // primitive int: the old Integer counter boxed on every increment
	try {
		Iterator<Quad> iter = graph.find(null, null, null, null);
		while (iter.hasNext()) {
			iter.next(); // only the count matters; the quad itself was an unused local
			count++;
		}
	} finally {
		// end the read txn even if counting (or the check below) fails;
		// the original leaked the txn when fail() threw before graph.end()
		graph.end();
	}
	if (count != 21) {
		fail("expected 21 quads but found " + count);
	}
}

/**
 * Triplifies test1.csv into an on-disk (TDB2) graph and checks that
 * exactly 13 quads are produced.
 */
@Test
public void testWithOnDiskGraph2() throws IOException, TriplifierHTTPException {
	Properties properties = new Properties();
	properties.setProperty("namespace", "http://www.example.org#");
	// directory under which the temporary TDB2 directory is created
	properties.setProperty("ondisk", "/tmp");
	URL csv1 = getClass().getClassLoader().getResource("./test1.csv");
	properties.setProperty(IRIArgument.LOCATION.toString(), csv1.toString());
	DatasetGraph graph = triplifier.triplify(properties, new BaseFacadeXBuilder(csv1.toString(), properties));

	// end the write txn because triplifiers don't do that, FacadeXOpExecutor does
	graph.commit();
	graph.end();

	graph.begin(TxnType.READ);
	int count = 0; // primitive int: the old Integer counter boxed on every increment
	try {
		Iterator<Quad> iter = graph.find(null, null, null, null);
		while (iter.hasNext()) {
			iter.next(); // only the count matters; the quad itself was an unused local
			count++;
		}
	} finally {
		// end the read txn even if counting (or the check below) fails;
		// the original leaked the txn when fail() threw before graph.end()
		graph.end();
	}
	if (count != 13) {
		fail("expected 13 quads but found " + count);
	}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import org.apache.jena.graph.Triple;
import org.apache.jena.query.ARQ;
import org.apache.jena.query.DatasetFactory;
import org.apache.jena.tdb2.TDB2Factory;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Resource;
Expand Down Expand Up @@ -68,6 +69,7 @@
import org.apache.jena.sparql.util.Symbol;
import org.apache.jena.vocabulary.RDF;
import org.apache.jena.vocabulary.VOID;
import org.apache.jena.query.TxnType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -159,6 +161,11 @@ private DatasetGraph getDatasetGraph(Properties p, Op op) throws IOException, In

logger.trace("Triplifier {}\n{}", t.getClass().toString(), op.toString());
dg = triplify(op, p, t);

logger.debug("triplification done -- commiting and ending the write txn");
dg.commit();
dg.end();

if (urlLocation != null) {
logger.trace("Location provided {}", urlLocation);
URL url = Triplifier.instantiateURL(urlLocation);
Expand All @@ -170,6 +177,7 @@ private DatasetGraph getDatasetGraph(Properties p, Op op) throws IOException, In
executedFacadeXIris.put(getInMemoryCacheKey(p, op), dg);
logger.debug("Graph added to in-memory cache");
}
// TODO wrap this in a txn or move it to a place where we are already in a txn
// logger.trace("Triplified, #triples in default graph {} {}", dg.getDefaultGraph().size(), op.toString());

// else {
Expand Down Expand Up @@ -326,7 +334,9 @@ private DatasetGraph triplify(final Op op, Properties p, Triplifier t) throws IO
logger.error("No triplifier available for the input format!");
dg = DatasetFactory.create().asDatasetGraph();
}
//logger.trace("Default graph size {}", dg.getDefaultGraph().size());

// logger.trace("Union graph size {}",dg.getUnionGraph().size());
// logger.trace("Default graph size {}", dg.getDefaultGraph().size());
return dg;
}

Expand Down Expand Up @@ -531,10 +541,19 @@ protected QueryIterator execute(final OpPath s, QueryIterator input) {
}

protected QueryIterator execute(final OpBGP opBGP, QueryIterator input) {

if(this.execCxt.getClass()!=FacadeXExecutionContext.class) {
return super.execute(opBGP, input);
}

// i think we can consider this the start of the query execution and therefore the read txn.
// we won't end this read txn until the next query takes execution back through BaseFacadeXBuilder
if(!this.execCxt.getDataset().isInTransaction()){
// i think we need the test (instead of just unconditionally starting the txn) because if we postpone
// during a query execution, execution could pass through here again
logger.debug("begin read txn");
this.execCxt.getDataset().begin(TxnType.READ);
}

logger.trace("executing BGP {}", opBGP.toString());
logger.trace("Size: {} {}", this.execCxt.getDataset().size(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,34 +22,98 @@
import org.apache.jena.graph.Triple;
import org.apache.jena.rdf.model.*;
import org.apache.jena.sparql.core.DatasetGraph;
import org.apache.jena.tdb2.TDB2Factory;
import org.apache.jena.sparql.core.DatasetGraphFactory;
import org.apache.jena.query.Dataset;
import org.apache.jena.vocabulary.RDF;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.io.FileUtils;
import org.apache.jena.query.TxnType;

import java.net.URI;
import java.io.File;
import java.nio.file.Paths;
import java.nio.file.Files;
import java.util.Properties;

public class BaseFacadeXBuilder implements FacadeXGraphBuilder {
private static final String PROPERTY_ONDISK_REUSE = "ondisk.reuse";
private static final String PROPERTY_ONDISK = "ondisk";
protected static final Logger log = LoggerFactory.getLogger(TripleFilteringFacadeXBuilder.class);
protected final Properties properties;
protected final Node mainGraphName;

// when using a TDB2 this is defined
protected static Dataset dataset = null; // TODO making this static is a kludge maybe
protected final DatasetGraph datasetGraph;
//
protected final boolean p_blank_nodes;
protected final String p_namespace;
protected final String p_root;
protected final boolean p_trim_strings;
protected final String p_null_string;
protected static String previousTDB2Path = "";

/**
 * Creates a builder whose DatasetGraph is chosen by getDatasetGraph(properties):
 * an on-disk TDB2 dataset when the "ondisk" property is set, an in-memory
 * DatasetGraph otherwise.
 */
public BaseFacadeXBuilder(String resourceId, Properties properties) {
	// Pass null for the DatasetGraph: the protected constructor ignores it
	// and asks getDatasetGraph(properties) for the graph to populate.
	// (The superseded DatasetGraphFactory.create() delegation retained by the
	// diff scrape is dropped — a constructor can delegate only once.)
	this(resourceId, null, properties);
}

// this is where all graph (graphs that we actually put triples in) creation happens
/**
 * Returns the DatasetGraph that triples will be written into: a TDB2 on-disk
 * dataset when the "ondisk" property is set, otherwise a fresh in-memory
 * DatasetGraph.
 *
 * With "ondisk.reuse" also set, the previous TDB2 directory is reused;
 * otherwise any previous directory is deleted and a new temporary directory
 * is created under the configured path (falling back to the system temp
 * directory when that path is not a directory).
 */
public static DatasetGraph getDatasetGraph(Properties properties) {
	DatasetGraph dsg;
	String TDB2Path = "";
	boolean ONDISK = properties.containsKey(PROPERTY_ONDISK);
	boolean ONDISK_REUSE = properties.containsKey(PROPERTY_ONDISK_REUSE); // TODO any string counts as "true"

	if (ONDISK) {
		// compare String contents with isEmpty()/equals(), never with == / !=
		if (!BaseFacadeXBuilder.previousTDB2Path.isEmpty() && ONDISK_REUSE) {
			TDB2Path = previousTDB2Path;
		} else {
			try {
				if (!previousTDB2Path.isEmpty()) {
					log.debug("deleting previous TDB2 at: {}", previousTDB2Path);
					FileUtils.deleteDirectory(new File(previousTDB2Path));
				}
				if (Files.isDirectory(Paths.get(properties.getProperty(PROPERTY_ONDISK)))) {
					TDB2Path = Files.createTempDirectory(Paths.get(properties.getProperty(PROPERTY_ONDISK)), "").toString();
				} else {
					// fall back to the platform temp dir rather than a hard-coded /tmp
					log.debug("the specified path is not a directory: {}\nusing the system temp directory instead",
							properties.getProperty(PROPERTY_ONDISK));
					TDB2Path = Files.createTempDirectory(Paths.get(System.getProperty("java.io.tmpdir")), "").toString();
				}
				// store the TDB2Path for next time (in case we want to reuse it or delete it)
				BaseFacadeXBuilder.previousTDB2Path = TDB2Path;
			} catch (Exception ex) {
				// log the full stack trace instead of just ex.toString()
				log.error("failed to prepare the TDB2 directory", ex);
			}
		}
		log.debug("using on disk TDB2 at: {}", TDB2Path);
		dataset = TDB2Factory.connectDataset(TDB2Path);
		dsg = dataset.asDatasetGraph();
		if (dsg.isInTransaction()) {
			// if we are reusing the same TDB2 then this will be true so
			// end the read txn from the previous query
			dsg.end();
		}
	} else {
		log.debug("using in memory DatasetGraph");
		// an in-memory DatasetGraph is never reused across queries,
		// so there is no previous read txn to end
		dsg = DatasetGraphFactory.create();
	}
	return dsg;
}

protected BaseFacadeXBuilder(String resourceId, DatasetGraph ds, Properties properties) {
this.properties = properties;
this.mainGraphName = NodeFactory.createURI(resourceId);
this.datasetGraph = ds;
this.datasetGraph = BaseFacadeXBuilder.getDatasetGraph(properties);

// the single place to begin write txns
log.debug("begin write txn");
this.datasetGraph.begin(TxnType.WRITE);

this.p_blank_nodes = Triplifier.getBlankNodeArgument(properties);
this.p_trim_strings = Triplifier.getTrimStringsArgument(properties);
this.p_null_string = Triplifier.getNullStringArgument(properties);
Expand All @@ -73,7 +137,6 @@ public boolean add(Node graph, Node subject, Node predicate, Node object) {
if(p_null_string != null && object.isLiteral() && object.getLiteral().toString().equals(p_null_string)){
return false;
}

Triple t = new Triple(subject, predicate, object);
if (datasetGraph.getGraph(graph).contains(t)) {
return false;
Expand Down Expand Up @@ -175,7 +238,13 @@ public Model getModel() {

/**
 * Returns the populated DatasetGraph. For the in-memory dataset only, the
 * default graph is set to the union of all named graphs so that plain
 * {?s ?p ?o} patterns match the triplified data.
 */
@Override
public DatasetGraph getDatasetGraph() {
	// The unconditional setDefaultGraph(unionGraph) line the commit removed
	// must not run for TDB2: on-disk datasets do not support it.
	if (dataset == null) {
		// we have an in memory DatasetGraph
		datasetGraph.setDefaultGraph(datasetGraph.getUnionGraph());
		// we are unable to do that ^ with an on disk DatasetGraph (TDB2)
		// so that means you need to do `graph ?g {?s ?p ?o}` instead of simply
		// `{?s ?p ?o}` in a query when you use a TDB2
	}
	return datasetGraph;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,9 @@ public TripleFilteringFacadeXBuilder(String resourceId, Op op, DatasetGraph ds,
}

/**
 * Creates a filtering builder; DatasetGraph creation is delegated entirely
 * to BaseFacadeXBuilder (which honors the "ondisk" property).
 */
public TripleFilteringFacadeXBuilder(String resourceId, Op op, Properties properties) {
	// don't make a DatasetGraph here
	// instead let BaseFacadeXBuilder do all the DatasetGraph making
	// (the superseded DatasetGraphFactory.create() delegation retained by the
	// diff scrape is dropped — a constructor can delegate only once)
	this(resourceId, op, null, properties);
}
//
// public TripleFilteringFacadeXBuilder(URL location, Op op, Properties properties) {
Expand Down

0 comments on commit 83d9156

Please sign in to comment.