Skip to content

Commit

Permalink
Merge branch 'v0.6-DEV' into tdb2-exploration
Browse files Browse the repository at this point in the history
  • Loading branch information
justin2004 committed Dec 22, 2021
2 parents 65784e4 + bc785fe commit fae4782
Show file tree
Hide file tree
Showing 17 changed files with 1,144 additions and 41 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ WHERE {
|metadata|It tells sparql.anything to extract metadata from the data source and to store it in the named graph with URI <http://sparql.xyz/facade-x/data/metadata> |true/false|false|
|ondisk|It tells sparql.anything to use an on disk graph (instead of the default in memory graph). The string should be a path to a directory where the on disk graph will be stored. Using an on disk graph is almost always slower (than using the default in memory graph) but with it you can triplify large files without running out of memory.|a path to a directory|not set|
|ondisk.reuse|When using an on disk graph, it tells sparql.anything to reuse the previous on disk graph.|true|not set|
|strategy|The execution strategy. 0 = in memory, all triples; 1 = in memory, only triples matching any of the triple patterns in the where clause|0,1|1|

\* It is mandatory to provide either the location or the content.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ public static void main(String[] args) throws Exception {
Integer strategy = (commandLine.hasOption(STRATEGY) ? Integer.valueOf(commandLine.getOptionValue(STRATEGY))
: null);
if (strategy != null) {
if (strategy == 1 || strategy == 0) {
if (strategy == 1 || strategy == 0 || strategy == 2) {
ARQ.getContext().set(FacadeXOpExecutor.strategy, strategy);
} else {
logger.error("Invalid value for parameter 'strategy': {}", strategy);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,13 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Properties;
import java.util.Set;
import java.util.*;

import com.github.sparqlanything.model.TriplifierHTTPException;
import com.github.sparqlanything.model.filestream.FileStreamTriplifier;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.io.input.BOMInputStream;
Expand All @@ -42,7 +41,7 @@
import com.github.sparqlanything.model.FacadeXGraphBuilder;
import com.github.sparqlanything.model.Triplifier;

public class CSVTriplifier implements Triplifier {
public class CSVTriplifier implements FileStreamTriplifier {
private static final Logger log = LoggerFactory.getLogger(CSVTriplifier.class);
public final static String PROPERTY_FORMAT = "csv.format", PROPERTY_HEADERS = "csv.headers";
public final static String PROPERTY_DELIMITER = "csv.delimiter";
Expand Down Expand Up @@ -174,4 +173,9 @@ public Set<String> getMimeTypes() {
// Returns a newly created hash set whose only element is the file extension "csv".
public Set<String> getExtensions() {
return Sets.newHashSet("csv");
}

/**
 * Lists the data source identifiers for the given configuration.
 *
 * The result is a fixed-size, single-element list holding whatever
 * {@code Triplifier.getRootArgument(Properties)} returns for these properties.
 *
 * @param properties the triplifier configuration
 * @return a one-element list containing the root argument
 */
public List<String> getDataSourceIds(Properties properties) {
    return Arrays.asList(Triplifier.getRootArgument(properties));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,15 @@
import java.util.Map;
import java.util.Properties;

import com.github.sparqlanything.csv.CSVTriplifier;
import com.github.sparqlanything.model.filestream.FileStreamDatasetGraph;
import com.github.sparqlanything.model.filestream.FileStreamManager;
import com.github.sparqlanything.model.filestream.FileStreamTriplifier;
import org.apache.commons.io.FilenameUtils;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.NodeFactory;
import org.apache.jena.graph.Triple;
import org.apache.jena.query.ARQ;
import org.apache.jena.query.DatasetFactory;
import org.apache.jena.tdb2.TDB2Factory;
import org.apache.jena.rdf.model.Model;
Expand Down Expand Up @@ -258,22 +263,34 @@ protected QueryIterator nextStage(Binding binding) {

private DatasetGraph triplify(final Op op, Properties p, Triplifier t) throws IOException {
DatasetGraph dg;
Integer strategy = execCxt.getContext().get(FacadeXOpExecutor.strategy);
if (strategy == null) {

Integer strategy = null;
// Local value for strategy?
String localStrategy = p.getProperty(IRIArgument.STRATEGY.toString());
// Global value for strategy?
Integer globalStrategy = execCxt.getContext().get(FacadeXOpExecutor.strategy);
if(localStrategy != null){
if(globalStrategy!=null){
logger.warn("Local strategy {} overriding global strategy {}", localStrategy, globalStrategy);
}
strategy = Integer.parseInt(localStrategy);
} else if(globalStrategy!=null){
strategy = globalStrategy;
} else{
// Default strategy
strategy = 1;
}

URL url = Triplifier.getLocation(p);
String resourceId;
if (url == null) {
// XXX This method of passing content seems only supported by the
// TextTriplifier.
logger.trace("No location, use content: {}", p.getProperty(IRIArgument.CONTENT.toString()));
String id = Integer.toString(p.getProperty(IRIArgument.CONTENT.toString(), "").toString().hashCode());
resourceId = "content:" + id;
} else {
resourceId = url.toString();
}
String resourceId = Triplifier.getResourceId(p);
// if (url == null) {
// // XXX This method of passing content seems only supported by the
// // TextTriplifier.
// logger.trace("No location, use content: {}", p.getProperty(IRIArgument.CONTENT.toString()));
// String id = Integer.toString(p.getProperty(IRIArgument.CONTENT.toString(), "").toString().hashCode());
// resourceId = "content:" + id;
// } else {
// resourceId = url.toString();
// }

// logger.trace("No location, use content: {}",
// p.getProperty(IRIArgument.CONTENT.toString()));
Expand All @@ -282,16 +299,23 @@ private DatasetGraph triplify(final Op op, Properties p, Triplifier t) throws IO

logger.debug("Execution strategy: {} {}", strategy, op.toString());
if (t != null) {
FacadeXGraphBuilder builder;
if (strategy == 1) {
logger.trace("Executing: {} [strategy={}]", p, strategy);
builder = new TripleFilteringFacadeXBuilder(resourceId, op, p);
} else {
logger.trace("Executing: {} [strategy={}]", p, strategy);
builder = new BaseFacadeXBuilder(resourceId, p);
}
try {
dg = t.triplify(p, builder);
if (strategy == 2){
logger.warn("Strategy 2 is experimental!");
// XXX Experimental, Triplifier must implement FileStreamTriplifier
FileStreamManager man = new FileStreamManager(ARQ.getContext(), p, (FileStreamTriplifier) t);
dg = new FileStreamDatasetGraph(man);
} else {
FacadeXGraphBuilder builder;
if (strategy == 1) {
logger.trace("Executing: {} [strategy={}]", p, strategy);
builder = new TripleFilteringFacadeXBuilder(resourceId, op, p);
} else {
logger.trace("Executing: {} [strategy={}]", p, strategy);
builder = new BaseFacadeXBuilder(resourceId, p);
}
dg = t.triplify(p, builder);
}
} catch (TriplifierHTTPException e) {
if (p.getProperty(PROPERTY_OPSERVICE_SILENT).equals("true")) {
// as per https://www.w3.org/TR/sparql11-federated-query/#serviceFailure
Expand All @@ -311,8 +335,8 @@ private DatasetGraph triplify(final Op op, Properties p, Triplifier t) throws IO
dg = DatasetFactory.create().asDatasetGraph();
}

logger.trace("Union graph size {}",dg.getUnionGraph().size());
logger.trace("Default graph size {}", dg.getDefaultGraph().size());
// logger.trace("Union graph size {}",dg.getUnionGraph().size());
// logger.trace("Default graph size {}", dg.getDefaultGraph().size());
return dg;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@

package com.github.sparqlanything.html;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
Expand All @@ -32,17 +30,10 @@

import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.http.HTTPClient;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.HTTPDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.jena.ext.com.google.common.collect.Sets;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
//import org.apache.jena.graph.NodeFactory;
//import org.apache.jena.query.DatasetFactory;
//import org.apache.jena.rdf.model.AnonId;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright (c) 2021 SPARQL Anything Contributors @ http://github.com/sparql-anything
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package com.github.sparqlanything.it;

import com.github.sparqlanything.csv.CSVTriplifier;
import com.github.sparqlanything.engine.FacadeX;
import com.github.sparqlanything.model.Triplifier;
import com.github.sparqlanything.model.filestream.FileStreamDatasetGraph;
import com.github.sparqlanything.model.filestream.FileStreamManager;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.NodeFactory;
import org.apache.jena.query.*;
import org.apache.jena.sparql.core.Quad;
import org.apache.jena.sparql.engine.main.QC;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.util.Iterator;
import java.util.Properties;

/**
 * Integration tests for the experimental file-stream execution strategy
 * (strategy=2), exercised against the {@code test1.csv} classpath resource.
 */
public class FileStreamTest {
    private static final Logger log = LoggerFactory.getLogger(FileStreamTest.class);

    /**
     * Shorthand for building a variable node to be used in a find pattern.
     *
     * @param v the variable name
     * @return a variable node with the given name
     */
    private Node var(String v) {
        return NodeFactory.createVariable(v);
    }

    /**
     * Counts the quads returned by {@code findNG} with an all-variables pattern
     * on a {@link FileStreamDatasetGraph} backed by the CSV triplifier.
     */
    @Test
    public void test() throws URISyntaxException, MalformedURLException {
        Properties properties = new Properties();
        properties.put("location", getClass().getClassLoader().getResource("test1.csv").toURI().toString());
        FileStreamManager man = new FileStreamManager(ARQ.getContext(), properties, new CSVTriplifier());
        FileStreamDatasetGraph dg = new FileStreamDatasetGraph(man);
        Iterator<Quad> it = dg.findNG(var("g"), var("s"), var("p"), var("o"));
        int c = 0;
        while (it.hasNext()) {
            // The iterator must be advanced: without next() the loop never
            // consumes an element and hasNext() keeps returning true.
            it.next();
            c++;
        }
        Assert.assertEquals(16, c);
    }

    /**
     * Runs a SERVICE query using strategy=2 with a multi-pattern join and a
     * filter, and checks the number of solutions.
     */
    @Test
    public void testQueryJoin() throws URISyntaxException, MalformedURLException {
        Dataset kb = DatasetFactory.createGeneral();
        QC.setFactory(ARQ.getContext(), FacadeX.ExecutorFactory);
        String location = getClass().getClassLoader().getResource("test1.csv").toURI().toString();
        Query query = QueryFactory.create(
                "PREFIX fx: <http://sparql.xyz/facade-x/ns/>\n" +
                "PREFIX xyz: <http://sparql.xyz/facade-x/data/>\n" +
//                "SELECT * WHERE { SERVICE <x-sparql-anything:csv.headers=true,strategy=2,location="
//                + location + "> { ?a ?b ?c }} ");
                "SELECT ?a ?b ?c ?d WHERE { SERVICE <x-sparql-anything:csv.headers=true,strategy=2,location="
                + location + "> { [] xyz:A ?a ; xyz:B ?b ; xyz:C ?c ; xyz:D ?d . filter(?d != \"\") }} ");
        int c = 0;
        // Close the execution when done so its resources are released.
        try (QueryExecution qe = QueryExecutionFactory.create(query, kb)) {
            ResultSet rs = qe.execSelect();
            while (rs.hasNext()) {
                rs.next();
                c++;
            }
        }
        Assert.assertEquals(2, c);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public enum IRIArgument {

LOCATION("location"), MEDIA_TYPE("media-type"), NAMESPACE("namespace"), ROOT("root"), BLANK_NODES("blank-nodes"),
TRIPLIFIER("triplifier"), CHARSET("charset"), METADATA("metadata"), CONTENT("content"),
FROM_ARCHIVE("from-archive"), TRIM_STRINGS( "trim-strings" ), NULL_STRING( "null-string" );
FROM_ARCHIVE("from-archive"), TRIM_STRINGS( "trim-strings" ), NULL_STRING( "null-string" ), STRATEGY("strategy");

private String s;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ static Charset getCharsetArgument(Properties properties) {
return charset;
}

@Deprecated
static String getRootArgument(Properties properties, URL url) {
if (url != null) {
return getRootArgument(properties, url.toString());
Expand All @@ -106,6 +107,23 @@ static String getRootArgument(Properties properties, URL url) {
}
}

/**
 * Resolves the root argument from the properties alone, using the location
 * obtained from {@code Triplifier.getLocation(Properties)}.
 *
 * If the location cannot be parsed as a URL, the error is logged and the root
 * is resolved as if no location had been given.
 *
 * @param properties the triplifier configuration
 * @return the root argument computed by the location-aware overloads
 */
static String getRootArgument(Properties properties) {
    URL location;
    try {
        location = Triplifier.getLocation(properties);
    } catch (MalformedURLException e) {
        log.error("Malformed url", e);
        // Fall back to the String overload with an explicit null location.
        return getRootArgument(properties, (String) null);
    }
    return getRootArgument(properties, location);
}

/**
* Implementation to be moved to getRootArgument(Properties)
*
* @param properties
* @param url
* @return
*/
@Deprecated
static String getRootArgument(Properties properties, String url) {
if (url != null) {
String root = null;
Expand Down Expand Up @@ -222,6 +240,24 @@ public static InputStream getInputStream(URL url, Properties properties)
return getInputStream(url, properties, getCharsetArgument(properties));
}


/**
 * Derives an identifier for the resource described by the properties.
 *
 * When a location is available, its string form is the identifier. Otherwise,
 * when the properties contain a content entry, the identifier is
 * {@code "content:"} followed by the hash code of the content string. When
 * neither is available the result is {@code null}.
 *
 * A malformed location is logged and then treated as absent.
 *
 * @param properties the triplifier configuration
 * @return the resource identifier, or {@code null} if none can be derived
 */
public static String getResourceId(Properties properties) {
    URL location = null;
    try {
        location = Triplifier.getLocation(properties);
    } catch (MalformedURLException e) {
        log.error("Malformed url", e);
    }

    if (location != null) {
        return location.toString();
    }

    final String contentKey = IRIArgument.CONTENT.toString();
    if (!properties.containsKey(contentKey)) {
        return null;
    }

    // XXX This method of passing content seems only supported by the
    // TextTriplifier.
    log.trace("No location, use content: {}", properties.getProperty(contentKey));
    int contentHash = properties.getProperty(contentKey, "").hashCode();
    return "content:" + contentHash;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (c) 2021 SPARQL Anything Contributors @ http://github.com/sparql-anything
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package com.github.sparqlanything.model.filestream;

import org.apache.jena.sparql.core.Quad;

import java.util.ArrayList;
import java.util.List;

/**
 * A first-in, first-out holding area for quads, paired with a flag that marks
 * the point after which the buffer is considered completed.
 *
 * This class performs no synchronization of its own.
 */
public class FileStreamBuffer {
    /** Buffered quads, in insertion order. */
    List<Quad> queue = new ArrayList<>();
    /** Becomes true once {@link #setCompleted()} has been called. */
    boolean completed = false;

    /** @return true when no quads are currently buffered */
    boolean isEmpty() {
        return queue.isEmpty();
    }

    /**
     * Appends a quad to the end of the buffer.
     *
     * @param quad the quad to buffer
     */
    void add(Quad quad) {
        queue.add(quad);
    }

    /** @return true once the buffer has been marked as completed */
    boolean isCompleted() {
        return completed;
    }

    /**
     * @return true when the buffer is empty but has not been marked as
     *         completed
     */
    boolean isWaiting() {
        if (!isEmpty()) {
            return false;
        }
        return !isCompleted();
    }

    /**
     * Removes and returns the oldest buffered quad.
     *
     * @return the quad at the head of the buffer
     * @throws IndexOutOfBoundsException if the buffer is empty
     */
    Quad fetch() {
        return queue.remove(0);
    }

    /** Marks the buffer as completed. */
    void setCompleted() {
        completed = true;
    }
}
Loading

0 comments on commit fae4782

Please sign in to comment.