diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/neamt/AbstractNeamtAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/neamt/AbstractNeamtAnnotator.java
new file mode 100644
index 00000000..b8508220
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/neamt/AbstractNeamtAnnotator.java
@@ -0,0 +1,209 @@
+package org.aksw.gerbil.annotator.impl.neamt;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+
+import org.aksw.gerbil.annotator.http.AbstractHttpBasedAnnotator;
+import org.aksw.gerbil.datatypes.ErrorTypes;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
+import org.aksw.gerbil.transfer.nif.data.NamedEntity;
+import org.aksw.gerbil.transfer.nif.data.SpanImpl;
+import org.apache.commons.io.IOUtils;
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpHeaders;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.ContentType;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.util.EntityUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParser;
+
+/**
+ * Abstract annotator class for annotation systems that are hosted by NEAMT. The
+ * text of a document is sent to the service as a JSON request and the mentions
+ * listed in the JSON response are added to a new result document.
+ *
+ * @author Michael Röder (michael.roeder@uni-paderborn.de)
+ *
+ */
+public abstract class AbstractNeamtAnnotator extends AbstractHttpBasedAnnotator {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(AbstractNeamtAnnotator.class);
+
+    private static final String MEDIA_TYPE_STRING = ContentType.create("application/json", StandardCharsets.UTF_8)
+            .toString();
+
+    /**
+     * Service URL.
+     */
+    protected String serviceUrl;
+    /**
+     * Component name as defined in the NEAMT service documentation.
+     */
+    protected String components;
+    /**
+     * Language tag.
+     */
+    protected String lang;
+
+    /**
+     * Constructor.
+     *
+     * @param serviceUrl URL to which the requests are sent
+     * @param components value of the "components" property of the request body
+     * @param lang       value of the "lang" property of the request body
+     */
+    public AbstractNeamtAnnotator(String serviceUrl, String components, String lang) {
+        super();
+        this.serviceUrl = serviceUrl;
+        this.components = components;
+        this.lang = lang;
+    }
+
+    /**
+     * Sends the given document to the service and transforms the response into a
+     * new document.
+     *
+     * @param document the document that should be annotated
+     * @return a new document with the text and URI of the given document and the
+     *         markings parsed from the response
+     * @throws GerbilException if the HTTP request cannot be created or the response
+     *                         cannot be parsed
+     */
+    protected Document request(Document document) throws GerbilException {
+        String text = document.getText();
+        String documentUri = document.getDocumentURI();
+        LOGGER.info("Started request for {}", documentUri);
+        HttpPost request = null;
+        try {
+            request = createPostRequest(serviceUrl);
+        } catch (Exception e) {
+            throw new GerbilException("Couldn't create HTTP request.", e, ErrorTypes.UNEXPECTED_EXCEPTION);
+        }
+
+        JsonObject requestBody = createRequestBody(document);
+        request.setEntity(new StringEntity(requestBody.toString(), StandardCharsets.UTF_8));
+
+        request.addHeader(HttpHeaders.CONTENT_TYPE, MEDIA_TYPE_STRING);
+        request.addHeader(HttpHeaders.ACCEPT, MEDIA_TYPE_STRING);
+
+        HttpEntity entity = null;
+        CloseableHttpResponse response = null;
+        Document resultDoc = null;
+        try {
+            response = sendRequest(request);
+            entity = response.getEntity();
+            try {
+                resultDoc = new DocumentImpl(text, documentUri);
+                // Decode the response explicitly as UTF-8 instead of relying on the
+                // default charset of the platform
+                String content = IOUtils.toString(entity.getContent(), StandardCharsets.UTF_8);
+                JsonObject outJson = new JsonParser().parse(content).getAsJsonObject();
+                parseMarkings(outJson, resultDoc);
+            } catch (Exception e) {
+                LOGGER.error("Couldn't parse the response.", e);
+                throw new GerbilException("Couldn't parse the response.", e, ErrorTypes.UNEXPECTED_EXCEPTION);
+            }
+        } finally {
+            closeRequest(request);
+            if (entity != null) {
+                try {
+                    EntityUtils.consume(entity);
+                } catch (IOException e) {
+                    // Releasing the entity is best effort and must not hide the result
+                    LOGGER.debug("Couldn't consume the response entity.", e);
+                }
+            }
+            IOUtils.closeQuietly(response);
+        }
+        LOGGER.info("Finished request for {}", resultDoc.getDocumentURI());
+        return resultDoc;
+    }
+
+    /**
+     * Creates the JSON body of the request for the given document.
+     *
+     * @param document the document that should be annotated
+     * @return a JSON object with the properties "query", "components", "full_json"
+     *         and "lang"
+     */
+    protected JsonObject createRequestBody(Document document) {
+        JsonObject requestBody = new JsonObject();
+        requestBody.addProperty("query", document.getText());
+        requestBody.addProperty("components", components);
+        requestBody.addProperty("full_json", true);
+        requestBody.addProperty("lang", lang);
+        return requestBody;
+    }
+
+    /**
+     * Parses the "ent_mentions" array of the given response and adds the mentions
+     * as markings to the given document. A warning is logged if the response has no
+     * such array.
+     *
+     * @param outJson   the JSON object returned by the service
+     * @param resultDoc the document to which the markings are added
+     */
+    protected void parseMarkings(JsonObject outJson, Document resultDoc) {
+        if (outJson.has("ent_mentions")) {
+            JsonElement element = outJson.get("ent_mentions");
+            if (element.isJsonArray()) {
+                JsonArray mentions = element.getAsJsonArray();
+                mentions.forEach(m -> parseMarking(m, resultDoc));
+                return;
+            }
+        }
+        LOGGER.warn("Couldn't find any mentions in the result \"{}\". It will be ignored.", outJson.toString());
+    }
+
+    /**
+     * Parses a single mention and adds it to the given document. A mention with a
+     * non-empty "link" becomes a {@link NamedEntity}, any other mention becomes a
+     * {@link SpanImpl}. A mention without usable positions is logged and ignored.
+     *
+     * @param mentionElement the JSON element describing the mention
+     * @param resultDoc      the document to which the marking is added
+     */
+    protected void parseMarking(JsonElement mentionElement, Document resultDoc) {
+        if (mentionElement.isJsonObject()) {
+            JsonObject mentionObj = mentionElement.getAsJsonObject();
+            // The marking should have start and end
+            if (mentionObj.has("start") && mentionObj.has("end")) {
+                int start = mentionObj.get("start").getAsInt();
+                int end = mentionObj.get("end").getAsInt();
+                // Reject positions that cannot describe a span
+                if ((start >= 0) && (end >= start)) {
+                    String iri = null;
+                    // It may have a link; an explicit JSON null means that there is none
+                    if (mentionObj.has("link") && !mentionObj.get("link").isJsonNull()) {
+                        iri = mentionObj.get("link").getAsString();
+                        if (iri.isEmpty()) {
+                            iri = null;
+                        } else {
+                            // It is just the Wikidata ID, so we have to add the namespace
+                            iri = "http://www.wikidata.org/entity/" + iri;
+                        }
+                    }
+                    // If we have found no IRI, we have a Span, otherwise a NamedEntity
+                    if (iri == null) {
+                        resultDoc.addMarking(new SpanImpl(start, end - start));
+                    } else {
+                        resultDoc.addMarking(new NamedEntity(start, end - start, iri));
+                    }
+                    return; // We can return without problems
+                }
+            }
+        }
+        // Something went wrong
+        LOGGER.warn("Couldn't parse mention \"{}\". It will be ignored.", mentionElement.toString());
+    }
+
+}
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/neamt/NeamtD2KBAnnotator.java b/src/main/java/org/aksw/gerbil/annotator/impl/neamt/NeamtD2KBAnnotator.java
new file mode 100644
index 00000000..d5da2b89
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/neamt/NeamtD2KBAnnotator.java
@@ -0,0 +1,52 @@
+package org.aksw.gerbil.annotator.impl.neamt;
+
+import java.util.List;
+
+import org.aksw.gerbil.annotator.D2KBAnnotator;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.MeaningSpan;
+import org.aksw.gerbil.transfer.nif.Span;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonObject;
+
+/**
+ * D2KB annotator for linking systems that are hosted by NEAMT. The mentions
+ * that are already marked in a document are sent to the service as part of the
+ * request.
+ */
+public class NeamtD2KBAnnotator extends AbstractNeamtAnnotator implements D2KBAnnotator {
+
+    public NeamtD2KBAnnotator(String serviceUrl, String components, String lang) {
+        super(serviceUrl, components, lang);
+    }
+
+    @Override
+    public List<MeaningSpan> performD2KBTask(Document document) throws GerbilException {
+        return request(document).getMarkings(MeaningSpan.class);
+    }
+
+    /**
+     * Extends the request body of the super class with an "ent_mentions" array
+     * that contains start, end and surface form of every span of the document.
+     */
+    @Override
+    protected JsonObject createRequestBody(Document document) {
+        // Add the entity mentions to the request
+        String text = document.getText();
+        JsonObject requestBody = super.createRequestBody(document);
+        JsonArray mentions = new JsonArray();
+        for (Span span : document.getMarkings(Span.class)) {
+            int start = span.getStartPosition();
+            int end = start + span.getLength();
+            JsonObject mention = new JsonObject();
+            mention.addProperty("start", start);
+            mention.addProperty("end", end);
+            mention.addProperty("surface_form", text.substring(start, end));
+            mentions.add(mention);
+        }
+        requestBody.add("ent_mentions", mentions);
+        return requestBody;
+    }
+}
diff --git a/src/main/java/org/aksw/gerbil/annotator/impl/neamt/NeamtEntityRecognizer.java
b/src/main/java/org/aksw/gerbil/annotator/impl/neamt/NeamtEntityRecognizer.java
new file mode 100644
index 00000000..1ffda285
--- /dev/null
+++ b/src/main/java/org/aksw/gerbil/annotator/impl/neamt/NeamtEntityRecognizer.java
@@ -0,0 +1,24 @@
+package org.aksw.gerbil.annotator.impl.neamt;
+
+import java.util.List;
+
+import org.aksw.gerbil.annotator.EntityRecognizer;
+import org.aksw.gerbil.exceptions.GerbilException;
+import org.aksw.gerbil.transfer.nif.Document;
+import org.aksw.gerbil.transfer.nif.Span;
+
+/**
+ * Entity recognizer for recognition systems that are hosted by NEAMT.
+ */
+public class NeamtEntityRecognizer extends AbstractNeamtAnnotator implements EntityRecognizer {
+
+    public NeamtEntityRecognizer(String serviceUrl, String components, String lang) {
+        super(serviceUrl, components, lang);
+    }
+
+    @Override
+    public List<Span> performRecognition(Document document) throws GerbilException {
+        return request(document).getMarkings(Span.class);
+    }
+
+}
diff --git a/src/main/properties/annotators.properties b/src/main/properties/annotators.properties
index e397d085..7068da8c 100644
--- a/src/main/properties/annotators.properties
+++ b/src/main/properties/annotators.properties
@@ -50,6 +50,13 @@ org.aksw.gerbil.annotators.definition.cetus2.cacheable=true
 org.aksw.gerbil.annotators.definition.cetus2.class=org.aksw.gerbil.annotator.impl.nif.NIFBasedAnnotatorWebservice
 org.aksw.gerbil.annotators.definition.cetus2.constructorArgs=${org.aksw.gerbil.annotator.cetus2.ServieURL}
 
+### Davlan
+org.aksw.gerbil.annotators.definition.Davlan.name=Davlan (NEAMT)
+org.aksw.gerbil.annotators.definition.Davlan.experimentType=ERec
+org.aksw.gerbil.annotators.definition.Davlan.cacheable=true
+org.aksw.gerbil.annotators.definition.Davlan.class=org.aksw.gerbil.annotator.impl.neamt.NeamtEntityRecognizer
+org.aksw.gerbil.annotators.definition.Davlan.constructorArgs=${org.aksw.gerbil.annotators.NEAMT.url}, davlan_ner, en
+
 ### DBpedia Spotlight
 org.aksw.gerbil.annotators.definition.spotlight.name=DBpedia Spotlight
 org.aksw.gerbil.annotators.definition.spotlight.experimentType=OKE_Task1
@@ -57,7 +64,6 @@ org.aksw.gerbil.annotators.definition.spotlight.cacheable=true
 org.aksw.gerbil.annotators.definition.spotlight.class=org.aksw.gerbil.annotator.impl.spotlight.SpotlightAnnotator
 org.aksw.gerbil.annotator.impl.spotlight.SpotlightAnnotator.ServieURL=https://api.dbpedia-spotlight.org/en/
 
-
 ### Dexter
 org.aksw.gerbil.annotators.DexterAnnotator.annotationUrl=http://dexterdemo.isti.cnr.it:8080/dexter-webapp/api/nif/annotate
 org.aksw.gerbil.annotators.definition.Dexter.name=Dexter
@@ -94,6 +100,13 @@ org.aksw.gerbil.annotators.definition.FALCON.cacheable=true
 org.aksw.gerbil.annotators.definition.FALCON.class=org.aksw.gerbil.annotator.impl.falcon.FALCONAnnotator
 org.aksw.gerbil.annotators.definition.FALCON.constructorArgs=${org.aksw.gerbil.annotator.FALCON.ServieURL}
 
+### FLAIR
+org.aksw.gerbil.annotators.definition.FLAIR.name=Flair (NEAMT)
+org.aksw.gerbil.annotators.definition.FLAIR.experimentType=ERec
+org.aksw.gerbil.annotators.definition.FLAIR.cacheable=true
+org.aksw.gerbil.annotators.definition.FLAIR.class=org.aksw.gerbil.annotator.impl.neamt.NeamtEntityRecognizer
+org.aksw.gerbil.annotators.definition.FLAIR.constructorArgs=${org.aksw.gerbil.annotators.NEAMT.url}, flair_ner, en
+
 ### FRED
 org.aksw.gerbil.annotators.FredAnnotator.serviceUrl=http://wit.istc.cnr.it/stlab-tools/fred
 org.aksw.gerbil.annotators.definition.fred.name=FRED
@@ -168,6 +181,15 @@ org.aksw.gerbil.annotators.definition.kea2.check.args=org.aksw.gerbil.annotators
 org.aksw.gerbil.annotators.definition.kea2.check.args=org.aksw.gerbil.annotators.KeaAnnotatorConfig.password
 org.aksw.gerbil.annotators.definition.kea2.constructorArgs=http://${org.aksw.gerbil.annotators.KeaAnnotatorConfig.user}:${org.aksw.gerbil.annotators.KeaAnnotatorConfig.password}@${org.aksw.gerbil.annotators.KeaAnnotatorConfig.disambiguationUrl}
 
+### mGENRE
+org.aksw.gerbil.annotators.definition.mGENRE.name=mGENRE (NEAMT)
+org.aksw.gerbil.annotators.definition.mGENRE.experimentType=D2KB
+org.aksw.gerbil.annotators.definition.mGENRE.cacheable=true
+org.aksw.gerbil.annotators.definition.mGENRE.class=org.aksw.gerbil.annotator.impl.neamt.NeamtD2KBAnnotator
+org.aksw.gerbil.annotators.definition.mGENRE.constructorArgs=${org.aksw.gerbil.annotators.NEAMT.url}, mgenre_el, en
+
+### NEAMT
+org.aksw.gerbil.annotators.NEAMT.url=http://porque.cs.upb.de/porque-neamt/custom-pipeline
 
 ### NERD-ML
 #NERD endpoint
@@ -214,6 +236,13 @@ org.aksw.gerbil.annotators.definition.REL.cacheable=true
 org.aksw.gerbil.annotators.definition.REL.class=org.aksw.gerbil.annotator.impl.rel.RELAnnotator
 org.aksw.gerbil.annotators.definition.REL.constructorArgs=${org.aksw.gerbil.annotator.REL.ServieURL}
 
+### Spacy
+org.aksw.gerbil.annotators.definition.Spacy.name=Spacy (NEAMT)
+org.aksw.gerbil.annotators.definition.Spacy.experimentType=ERec
+org.aksw.gerbil.annotators.definition.Spacy.cacheable=true
+org.aksw.gerbil.annotators.definition.Spacy.class=org.aksw.gerbil.annotator.impl.neamt.NeamtEntityRecognizer
+org.aksw.gerbil.annotators.definition.Spacy.constructorArgs=${org.aksw.gerbil.annotators.NEAMT.url}, spacy_ner, en
+
 ### Tagme
 org.aksw.gerbil.annotators.TagmeAnnotator.annotateUrl=https://tagme.d4science.org/tagme/tag
 org.aksw.gerbil.annotators.TagmeAnnotator.spotUrl=https://tagme.d4science.org/tagme/spot
@@ -266,3 +295,10 @@ org.aksw.gerbil.annotators.definition.XLisa2.kb=dbpedia
 org.aksw.gerbil.annotators.definition.XLisa2.model=NER
 org.aksw.gerbil.annotators.definition.XLisa2.constructorArgs=${org.aksw.gerbil.annotators.definition.XLisa2.lang1}, ${org.aksw.gerbil.annotators.definition.XLisa2.lang2}, ${org.aksw.gerbil.annotators.definition.XLisa2.kb}, ${org.aksw.gerbil.annotators.definition.XLisa2.model}
 
+### WikiNEuRal
+org.aksw.gerbil.annotators.definition.WikiNEuRal.name=WikiNEuRal (NEAMT)
+org.aksw.gerbil.annotators.definition.WikiNEuRal.experimentType=ERec
+org.aksw.gerbil.annotators.definition.WikiNEuRal.cacheable=true
+org.aksw.gerbil.annotators.definition.WikiNEuRal.class=org.aksw.gerbil.annotator.impl.neamt.NeamtEntityRecognizer
+org.aksw.gerbil.annotators.definition.WikiNEuRal.constructorArgs=${org.aksw.gerbil.annotators.NEAMT.url}, babelscape_ner, en
+