Skip to content

Commit

Permalink
Implement #233 also useful to #203
Browse files Browse the repository at this point in the history
  • Loading branch information
enridaga committed Mar 25, 2022
1 parent 07436e7 commit 01cf978
Show file tree
Hide file tree
Showing 13 changed files with 801 additions and 9 deletions.
8 changes: 8 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,14 @@
<version>1.6.3</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.ximpleware/vtd-xml -->
<dependency>
<groupId>com.ximpleware</groupId>
<artifactId>vtd-xml</artifactId>
<version>2.13.4</version>
</dependency>


</dependencies>
</dependencyManagement>

Expand Down
16 changes: 15 additions & 1 deletion sparql-anything-xml/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,24 @@
<artifactId>sparql-anything-model</artifactId>
<version>${project.version}</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.ximpleware/vtd-xml -->
<dependency>
<groupId>com.ximpleware</groupId>
<artifactId>vtd-xml</artifactId>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.1</version>
<scope>test</scope>
</dependency>


<dependency>
<groupId>com.github.sparqlanything</groupId>
<artifactId>sparql-anything-testutils</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,20 @@
package com.github.sparqlanything.xml;

import com.github.sparqlanything.model.FacadeXGraphBuilder;
import com.github.sparqlanything.model.Slice;
import com.github.sparqlanything.model.Slicer;
import com.github.sparqlanything.model.Triplifier;
import com.github.sparqlanything.model.TriplifierHTTPException;
import com.ximpleware.AutoPilot;
import com.ximpleware.EncodingException;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.jena.ext.com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -44,20 +56,136 @@
import java.util.Deque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

public class XMLTriplifier implements Triplifier {
public class XMLTriplifier implements Triplifier, Slicer {

private static final Logger log = LoggerFactory.getLogger(XMLTriplifier.class);

@Override
public void triplify(Properties properties, FacadeXGraphBuilder builder) throws IOException, TriplifierHTTPException {
// URL url = Triplifier.getLocation(properties);
//
// if (url == null)
// return;
/**
 * Triplifies the XML source by evaluating each of the given XPath expressions with VTD-XML
 * and handing every match to {@link #transformFromXPath}.
 *
 * <p>The whole document is read into memory and parsed once; each expression is then
 * evaluated against the same navigator.
 *
 * @param xpaths     the XPath expressions to evaluate, in order
 * @param properties the triplifier configuration (source location, root argument, ...)
 * @param builder    the graph builder receiving the root, containers and values
 * @throws IOException             if the source cannot be read, or if parsing or XPath
 *                                 evaluation fails (the VTD-XML exception is the cause)
 * @throws TriplifierHTTPException propagated from {@code Triplifier.getInputStream}
 */
public void transformWithXPath(List<String> xpaths, Properties properties, FacadeXGraphBuilder builder) throws IOException, TriplifierHTTPException {

    String dataSourceId = Triplifier.getRootArgument(properties);
    String root = Triplifier.getRootArgument(properties);

    builder.addRoot(dataSourceId, root);
    try {
        VTDGen vg = new VTDGen();
        byte[] bytes;
        // Close the source stream as soon as its content has been buffered.
        try (java.io.InputStream in = Triplifier.getInputStream(properties)) {
            bytes = IOUtils.toByteArray(in);
        }
        vg.setDoc(bytes);
        // false: parse without namespace awareness
        vg.parse(false);
        VTDNav vn = vg.getNav();
        for (String xpath : xpaths) {
            log.debug("Evaluating XPath: {}", xpath);
            AutoPilot ap = new AutoPilot(vn);
            ap.selectXPath(xpath);
            int result;
            // 1-based slot of the next match under the root container.
            // NOTE(review): the slot restarts at 1 for every expression, so matches of
            // different expressions share slot numbers under the root -- confirm intended.
            int count = 1;
            while ((result = ap.evalXPath()) != -1) {
                transformFromXPath(vn, result, count, root, dataSourceId, properties, builder);
                count++;
            }
            // count is one past the last used slot, hence "- 1" for the number of matches
            log.debug("XPath: {} matches", count - 1);
        }

    } catch (XPathEvalException | NavException | ParseException | XPathParseException e) {
        log.error("Error while evaluating XPath expression", e);
        throw new IOException(e);
    }
}

/**
 * Transforms the VTD token at index {@code result} (and, for an element, the tokens nested
 * below it) into containers and values on the given builder.
 *
 * @param vn           the VTD navigator holding the parsed document
 * @param result       index of the token to transform
 * @param child        1-based slot of this token within its parent container
 * @param parentId     identifier of the parent container
 * @param dataSourceId identifier of the data source passed through to the builder
 * @param properties   triplifier configuration; only forwarded to recursive calls
 * @param builder      the graph builder receiving containers and values
 * @return the index of the last token consumed by this call; callers resume at the next one
 * @throws NavException if the navigator fails to read a token
 */
public int transformFromXPath(VTDNav vn, int result, int child, String parentId, String dataSourceId, Properties properties, FacadeXGraphBuilder builder) throws NavException {
    log.trace(" -- index: {} type: {}", result, vn.getTokenType(result));
    switch (vn.getTokenType(result)) {
        case VTDNav.TOKEN_STARTING_TAG:
            String tag = vn.toString(result);
            log.trace(" -- tag: {} ", tag);
            String childId = String.join("", parentId, "/", Integer.toString(child), ":", tag);
            builder.addContainer(dataSourceId, parentId, child, childId);

            int tokenCount = vn.getTokenCount();

            // Attributes
            // NOTE(review): getAttrCount() and getText() take no token index, so they
            // presumably refer to the navigator's cursor rather than to `result`; for
            // nested elements the count may therefore be off, which is why the token
            // type is re-checked on every iteration -- confirm.
            int attrCount = vn.getAttrCount();
            log.trace(" -- attr count: {}", attrCount);
            int increment = 0;
            if (attrCount > 0) {
                // Every attribute occupies two consecutive tokens (name, value), so the
                // last attribute token sits 2 * attrCount positions after the tag.
                int lastAttrToken = result + 2 * attrCount;
                for (int i = result + 1; i <= lastAttrToken && i < tokenCount; i += 2) {
                    if (vn.getTokenType(i) != VTDNav.TOKEN_ATTR_NAME) {
                        break;
                    }
                    String key = vn.toString(i);
                    String value = vn.toString(i + 1);
                    log.trace(" -- attr: {} = {}", key, value);
                    builder.addValue(dataSourceId, childId, key, value);
                    increment += 2;
                }
            }
            // Get the text
            int t = vn.getText(); // get the index of the text (char data or CDATA)
            if (t != -1) {
                String text = vn.toNormalizedString(t);
                log.trace(" -- text: {}", text);
                builder.addValue(dataSourceId, childId, 1, text);
            }

            // Iterate on children until a sibling/ancestor element or the end is reached
            int tokenDepth = vn.getTokenDepth(result);
            int index = result + increment;
            int childc = 1;
            while (true) {
                index++;
                // Stop at the end of the token buffer instead of relying on whatever the
                // navigator returns for an out-of-range index.
                if (index >= tokenCount) {
                    break;
                }
                int type = vn.getTokenType(index);
                String s = vn.toString(index);
                int d = vn.getTokenDepth(index);
                // An element that is not deeper than this one (or has an empty name)
                // does not belong to this subtree.
                if (type == VTDNav.TOKEN_STARTING_TAG && (d <= tokenDepth || s.equals(""))) {
                    break;
                }
                log.trace(" ... index: {} depth: {} type: {} string: {}", index, d, type, s);
                index = transformFromXPath(vn, index, childc, childId, dataSourceId, properties, builder);
                childc++;
            }
            // `index` is the first token outside the subtree; report the one before it
            return index - 1;
        case VTDNav.TOKEN_ATTR_NAME:
            // Attribute: the value token immediately follows the name token
            String name = vn.toString(result);
            String value = vn.toString(result + 1);
            log.trace("Attribute {} = {}", name, value);
            String attrChildId = String.join("", parentId, "/", Integer.toString(child), ":", name);
            builder.addContainer(dataSourceId, parentId, child, attrChildId);
            builder.addValue(dataSourceId, attrChildId, name, value);
            return result + 1;
        case VTDNav.TOKEN_ATTR_VAL:
            // Attribute value
            log.trace("Attribute value: {}", vn.toString(result));
            builder.addValue(dataSourceId, parentId, child, vn.toString(result));
            break;
        case VTDNav.TOKEN_CHARACTER_DATA:
            // Text
            // NOTE(review): the normalized text is only logged; the raw string is what
            // gets added, unlike the element branch above -- confirm which is intended.
            String text = vn.toNormalizedString(result);
            log.trace("Text: {}", text);
            builder.addValue(dataSourceId, parentId, child, vn.toString(result));
            break;
        case VTDNav.TOKEN_DEC_ATTR_NAME:
            // XML declaration attribute: logged only, name and value tokens are skipped
            log.trace("Attribute (dec): {} = {}", vn.toString(result), vn.toString(result + 1));
            return result + 1;
        case VTDNav.TOKEN_DEC_ATTR_VAL:
            log.trace("Attribute value (dec) {}", vn.toString(result));
            break;
        default:
            log.warn("Ignored event: {} {}", vn.getTokenType(result), vn.toString(result));
    }
    return result;
}

public void transformSAX(Properties properties, FacadeXGraphBuilder builder) throws IOException, TriplifierHTTPException {

String namespace = Triplifier.getNamespaceArgument(properties);
String dataSourceId = Triplifier.getRootArgument(properties);
Expand Down Expand Up @@ -137,7 +265,7 @@ public void triplify(Properties properties, FacadeXGraphBuilder builder) throws
log.trace("element open: {} [{}]", path, stack.size());

// XXX Create an RDF resource
String resourceId = path.substring(1);
String resourceId = StringUtils.join("", root, path);
// If this is the root
if (isRoot) {
// Add type root
Expand Down Expand Up @@ -203,4 +331,25 @@ public Set<String> getMimeTypes() {
public Set<String> getExtensions() {
    // The only file extension returned by this triplifier is "xml".
    final Set<String> extensions = Sets.newHashSet("xml");
    return extensions;
}


/**
 * Entry point of the triplifier: uses the XPath-based transformation when at least one
 * {@code xml.path} property value is present, and the SAX-based one otherwise.
 *
 * @param properties the triplifier configuration
 * @param builder    the graph builder to populate
 * @throws IOException             propagated from the selected transformation
 * @throws TriplifierHTTPException propagated from the selected transformation
 */
@Override
public void triplify(Properties properties, FacadeXGraphBuilder builder) throws IOException, TriplifierHTTPException {
    final List<String> xpaths = Triplifier.getPropertyValues(properties, "xml.path");
    if (xpaths.isEmpty()) {
        // No XPath configured: process the whole document
        transformSAX(properties, builder);
        return;
    }
    transformWithXPath(xpaths, properties, builder);
}

/**
 * Slicing is not implemented for XML yet, so no slices are produced.
 *
 * <p>An empty iterable is returned rather than {@code null} so that callers can iterate
 * over the result without a null check.
 *
 * @param p the triplifier configuration (currently unused)
 * @return an empty iterable
 */
@Override
public Iterable<Slice> slice(Properties p) throws IOException, TriplifierHTTPException {
    return java.util.Collections.<Slice>emptyList();
}

@Override
public void triplify(Slice slice, Properties p, FacadeXGraphBuilder builder) {
// Intentionally empty: this method currently does nothing with the given slice.
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright (c) 2022 SPARQL Anything Contributors @ http://github.com/sparql-anything
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package com.github.sparqlanything.xml;

import com.github.sparqlanything.testutils.AbstractTriplifierTester;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.junit.Test;

import java.util.Properties;

/**
 * Additional tests for {@link XMLTriplifier}, driven by {@code AbstractTriplifierTester}
 * with "xml" inputs and "ttl" expected outputs.
 *
 * <p>Each test only asserts that the produced graph is isomorphic with the expected one;
 * per-test configuration is supplied through {@link #properties(Properties)}.
 */
public class MoreXMLTriplifierTest extends AbstractTriplifierTester {

    public MoreXMLTriplifierTest() {
        super(new XMLTriplifier(), new Properties(), "xml", "ttl");
    }

    /**
     * Sets the triplifier options for the test being run, selected by its method name.
     *
     * @param properties the properties to populate for the current test
     */
    @Override
    protected void properties(Properties properties) {
        switch (name.getMethodName()) {
            case "testSimple$1":
            case "testBooks$1":
                properties.put("blank-nodes", "false");
                break;
            case "testBooks_1$1":
                properties.put("blank-nodes", "false");
                // Exercise the XPath-based transformation
                properties.put("xml.path", "//book");
                break;
            default:
                // No specific options for other tests
                break;
        }
    }

    @Test
    public void testSimple$1() {
        assertResultIsIsomorphicWithExpected();
    }

    @Test
    public void testBooks$1() {
        assertResultIsIsomorphicWithExpected();
    }

    @Test
    public void testBooks_1$1() {
        assertResultIsIsomorphicWithExpected();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,15 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Iterator;
import java.util.Properties;
Expand Down
Loading

0 comments on commit 01cf978

Please sign in to comment.