From d22a1ff38901854ca8022f7b5b5086554a110a61 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 26 Aug 2022 16:20:40 +0900 Subject: [PATCH] Compile the xpath at the instantiation of the patterns --- .../evaluation/EndToEndEvaluation.java | 64 ++++---- .../utilities/FieldSpecification.java | 154 +++++++++--------- 2 files changed, 112 insertions(+), 106 deletions(-) diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/EndToEndEvaluation.java b/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/EndToEndEvaluation.java index b17e3a1ff8..c71943ebf3 100644 --- a/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/EndToEndEvaluation.java +++ b/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/EndToEndEvaluation.java @@ -28,14 +28,10 @@ import org.w3c.dom.*; import javax.xml.namespace.QName; -import javax.xml.xpath.XPath; -import javax.xml.xpath.XPathExpressionException; -import javax.xml.xpath.XPathFactory; +import javax.xml.xpath.*; import javax.xml.parsers.*; import org.xml.sax.*; -import javax.xml.xpath.XPathConstants; - import com.rockymadden.stringmetric.similarity.RatcliffObershelpMetric; import scala.Option; @@ -153,10 +149,14 @@ public EndToEndEvaluation(String path, String inType) { headerLabels = new ArrayList<>(); fulltextLabels = new ArrayList<>(); citationsLabels = new ArrayList<>(); - - FieldSpecification.setUpFields(headerFields, fulltextFields, citationsFields, - headerLabels, fulltextLabels, citationsLabels); - } + + try { + FieldSpecification.setUpFields(headerFields, fulltextFields, citationsFields, + headerLabels, fulltextLabels, citationsLabels); + } catch (XPathExpressionException e) { + throw new RuntimeException("Invalid XPaths for evaluation. ", e); + } + } public String evaluationGrobid(boolean forceRun, StringBuilder reportMD) throws Exception { if (xmlInputPath == null) { @@ -530,27 +530,27 @@ public boolean accept(File dir, String name) { XPathFactory xpf = XPathFactory.newInstance(); XPath xp = xpf.newXPath(); - HashMap map = new HashMap(); +// Map map = new HashMap(); // explicit indication of the default namespace - map.put("tei", "http://www.tei-c.org/ns/1.0"); +// map.put("tei", "http://www.tei-c.org/ns/1.0"); - Map mappings = new HashMap(); + Map mappings = new HashMap<>(); mappings.put("tei", "http://www.tei-c.org/ns/1.0"); xp.setNamespaceContext(new NamespaceContextMap(mappings)); - if (sectionType == this.CITATION) { + if (sectionType == CITATION) { // we start by identifying each expected citation // the first FieldSpecification object for the citation is the base path for // each citation structure in the corresponding XML FieldSpecification base = fields.get(0); - String path = null; + XPathExpression path = null; if (inputType.equals("nlm")) path = base.nlmPath.get(0).getLeft(); else path = base.grobidPath.get(0).getLeft(); - NodeList nodeList = (NodeList) xp.compile(path). + NodeList nodeList = (NodeList) path. evaluate(gold.getDocumentElement(), XPathConstants.NODESET); int nbCitationsGold = nodeList.getLength(); totalExpectedInstances += nbCitationsGold; @@ -588,7 +588,7 @@ public boolean accept(File dir, String name) { //p++; continue; } - List> subpaths = null; + List> subpaths = null; if (inputType.equals("nlm")) { subpaths = field.nlmPath; } else if (inputType.equals("tei")) { @@ -598,8 +598,8 @@ public boolean accept(File dir, String name) { if (subpaths == null) continue; - for(Pair subpath : subpaths) { - NodeList nodeList2 = (NodeList) xp.compile(subpath.getLeft()). + for(Pair subpath : subpaths) { + NodeList nodeList2 = (NodeList) subpath.getLeft(). evaluate(node, subpath.getRight()); List goldResults = new ArrayList(); @@ -704,7 +704,7 @@ public boolean accept(File dir, String name) { * - third rule: matching of "soft" inTitle (title of Journal or Conference), volume and first page * - forth rule: matching of first author last name and title, or inTitle if title is empty */ - String signature1 = null; + String signature1 = null; if ( (goldTitleSoft.length()>0) && (goldDate.length()>0) ) { signature1 = goldTitleSoft + goldDate; //signature1 = signature1.replaceAll("[^\\x00-\\x7F]", ""); @@ -742,7 +742,7 @@ public boolean accept(File dir, String name) { // get the Grobid citations path = base.grobidPath.get(0).getLeft(); QName nodeType = base.grobidPath.get(0).getRight(); - nodeList = (NodeList) xp.compile(path). + nodeList = (NodeList) path. evaluate(tei.getDocumentElement(), nodeType); int nbCitationsGrobid = nodeList.getLength(); totalObservedInstances += nbCitationsGrobid; @@ -758,8 +758,8 @@ public boolean accept(File dir, String name) { //p++; continue; } - for(Pair subpath : field.grobidPath) { - NodeList nodeList2 = (NodeList) xp.compile(subpath.getLeft()). + for(Pair subpath : field.grobidPath) { + NodeList nodeList2 = (NodeList) subpath.getLeft(). evaluate(node, subpath.getRight()); List grobidResults = new ArrayList(); for (int j = 0; j < nodeList2.getLength(); j++) { @@ -1239,8 +1239,8 @@ else if (goldResultSoft.length() > 0) { List grobidResults = new ArrayList<>(); int nbGrobidResults = 0; - for(Pair path : field.grobidPath) { - NodeList nodeList = (NodeList) xp.compile(path.getLeft()). + for(Pair path : field.grobidPath) { + NodeList nodeList = (NodeList) path.getLeft(). evaluate(tei.getDocumentElement(), path.getRight()); nbGrobidResults = nodeList.getLength(); for (int i = 0; i < nodeList.getLength(); i++) { @@ -1262,7 +1262,7 @@ else if (goldResultSoft.length() > 0) { List goldResults = new ArrayList(); int nbGoldResults = 0; - List> subpaths = null; + List> subpaths = null; if (inputType.equals("nlm")) { subpaths = field.nlmPath; } else if (inputType.equals("tei")) { @@ -1272,8 +1272,8 @@ else if (goldResultSoft.length() > 0) { if (subpaths == null) continue; - for(Pair path : subpaths) { - NodeList nodeList = (NodeList) xp.compile(path.getLeft()). + for(Pair path : subpaths) { + NodeList nodeList = (NodeList) path.getLeft(). evaluate(gold.getDocumentElement(), path.getRight()); //System.out.println(path + ": " + nodeList.getLength() + " nodes"); nbGoldResults = nodeList.getLength(); @@ -1462,7 +1462,7 @@ else if (sectionType == this.FULLTEXT) { // List goldResults = new ArrayList<>(); // int nbgoldResults = 0; - List> subpaths = null; + List> subpaths = null; if (inputType.equals("nlm")) { subpaths = field.nlmPath; } else if (inputType.equals("tei")) { @@ -1841,16 +1841,16 @@ else if (sectionType == this.HEADER) { return report.toString(); } - private static List extractFromXPath(Document xmlDocument, List> extractionPaths, XPath xPath, FieldSpecification field) throws XPathExpressionException { + private static List extractFromXPath(Document xmlDocument, List> extractionPaths, XPath xPath, FieldSpecification field) throws XPathExpressionException { List results = new ArrayList<>(); - for(Pair path : extractionPaths) { + for(Pair path : extractionPaths) { if (path.getRight() == XPathConstants.NODESET) { - NodeList nodeList = (NodeList) xPath.compile(path.getLeft()).evaluate(xmlDocument.getDocumentElement(), path.getRight()); + NodeList nodeList = (NodeList) path.getLeft().evaluate(xmlDocument.getDocumentElement(), path.getRight()); for (int i = 0; i < nodeList.getLength(); i++) { results.add(basicNormalizationFullText(nodeList.item(i).getNodeValue(), field.fieldName)); } } else if (path.getRight() == XPathConstants.STRING) { - String string = (String) xPath.compile(path.getLeft()).evaluate(xmlDocument, path.getRight()); + String string = (String) path.getLeft().evaluate(xmlDocument, path.getRight()); results.add(basicNormalizationFullText(string, field.fieldName)); } else { throw new UnsupportedOperationException("Extraction from XPath works only with STRING or NODESET. Used: " + path.getRight().toString()); diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/utilities/FieldSpecification.java b/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/utilities/FieldSpecification.java index 428a8e6c7d..580c1352d9 100644 --- a/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/utilities/FieldSpecification.java +++ b/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/utilities/FieldSpecification.java @@ -3,8 +3,7 @@ import org.apache.commons.lang3.tuple.Pair; import javax.xml.namespace.QName; -import javax.xml.xpath.XPath; -import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.*; import java.util.*; /** @@ -15,8 +14,8 @@ public class FieldSpecification { public String fieldName = null; - public List> nlmPath = new ArrayList<>(); - public List> grobidPath = new ArrayList<>(); + public List> nlmPath = new ArrayList<>(); + public List> grobidPath = new ArrayList<>(); public List pdfxPath = new ArrayList<>(); public List cerminePath = new ArrayList<>(); @@ -31,15 +30,18 @@ public static void setUpFields(List headerFields, List citationsFields, List headerLabels, List fulltextLabels, - List citationsLabels) { - // header + List citationsLabels) throws XPathExpressionException { + + XPath xPath = XPathFactory.newInstance().newXPath(); + + // header // title FieldSpecification titleField = new FieldSpecification(); titleField.fieldName = "title"; titleField.isTextual = true; - titleField.grobidPath.add(Pair.of("//titleStmt/title/text()", XPathConstants.NODESET)); - titleField.nlmPath.add(Pair.of("/article/front/article-meta/title-group/article-title//text()", XPathConstants.NODESET)); + titleField.grobidPath.add(Pair.of(xPath.compile("//titleStmt/title/text()"), XPathConstants.NODESET)); + titleField.nlmPath.add(Pair.of(xPath.compile("/article/front/article-meta/title-group/article-title//text()"), XPathConstants.NODESET)); titleField.pdfxPath.add("/pdfx/article/front/title-group/article-title/text()"); headerFields.add(titleField); headerLabels.add("title"); @@ -54,11 +56,11 @@ public static void setUpFields(List headerFields, authorField.grobidPath. add("//sourceDesc/biblStruct/analytic/author/persName/forename[@type=\"middle\"]");*/ authorField.grobidPath. - add(Pair.of("//sourceDesc/biblStruct/analytic/author/persName/surname/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("//sourceDesc/biblStruct/analytic/author/persName/surname/text()"), XPathConstants.NODESET)); //authorField.nlmPath. // add("/article/front/article-meta/contrib-group/contrib[@contrib-type=\"author\"]/name/given-names"); authorField.nlmPath. - add(Pair.of("/article/front/article-meta/contrib-group/contrib[@contrib-type=\"author\"]/name/surname/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("/article/front/article-meta/contrib-group/contrib[@contrib-type=\"author\"]/name/surname/text()"), XPathConstants.NODESET)); authorField.pdfxPath.add("/pdfx/article/front/contrib-group/contrib[@contrib-type=\"author\"]/name/text()"); headerFields.add(authorField); headerLabels.add("authors"); @@ -72,11 +74,11 @@ public static void setUpFields(List headerFields, firstAuthorField.grobidPath .add("//sourceDesc/biblStruct/analytic/author/persName/forename[@type=\"middle\"]");*/ firstAuthorField.grobidPath - .add(Pair.of("//sourceDesc/biblStruct/analytic/author[1]/persName/surname/text()", XPathConstants.NODESET)); + .add(Pair.of(xPath.compile("//sourceDesc/biblStruct/analytic/author[1]/persName/surname/text()"), XPathConstants.NODESET)); //firstAuthorField.nlmPath // .add("/article/front/article-meta/contrib-group/contrib[@contrib-type=\"author\"]/name/given-names"); firstAuthorField.nlmPath - .add(Pair.of("/article/front/article-meta/contrib-group/contrib[@contrib-type=\"author\"][1]/name/surname/text()", XPathConstants.NODESET)); + .add(Pair.of(xPath.compile("/article/front/article-meta/contrib-group/contrib[@contrib-type=\"author\"][1]/name/surname/text()"), XPathConstants.NODESET)); firstAuthorField.pdfxPath .add("/pdfx/article/front/contrib-group/contrib[@contrib-type=\"author\"][1]/name/text()"); headerFields.add(firstAuthorField); @@ -87,8 +89,8 @@ public static void setUpFields(List headerFields, affiliationField.fieldName = "affiliations"; affiliationField.isTextual = true; //affiliationField.hasMultipleValue = true; - affiliationField.grobidPath.add(Pair.of("//sourceDesc/biblStruct/analytic/author/affiliation/orgName/text()", XPathConstants.NODESET)); - affiliationField.nlmPath.add(Pair.of("/article/front/article-meta/contrib-group/aff/text()", XPathConstants.NODESET)); + affiliationField.grobidPath.add(Pair.of(xPath.compile("//sourceDesc/biblStruct/analytic/author/affiliation/orgName/text()"), XPathConstants.NODESET)); + affiliationField.nlmPath.add(Pair.of(xPath.compile("/article/front/article-meta/contrib-group/aff/text()"), XPathConstants.NODESET)); affiliationField.pdfxPath.add("/pdfx/article/front/contrib-group"); //headerFields.add(affiliationField); //headerLabels.add("affiliations"); @@ -97,9 +99,9 @@ public static void setUpFields(List headerFields, FieldSpecification dateField = new FieldSpecification(); dateField.fieldName = "date"; dateField.grobidPath. - add(Pair.of("//publicationStmt/date[1]/@when", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("//publicationStmt/date[1]/@when"), XPathConstants.NODESET)); dateField.nlmPath. - add(Pair.of("/article/front/article-meta/pub-date[@pub-type=\"pmc-release\"][1]//text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("/article/front/article-meta/pub-date[@pub-type=\"pmc-release\"][1]//text()"), XPathConstants.NODESET)); //in bioRxiv: 2014 //headerFields.add(dateField); //headerLabels.add("date"); @@ -109,9 +111,9 @@ public static void setUpFields(List headerFields, abstractField.fieldName = "abstract"; abstractField.isTextual = true; abstractField.grobidPath. - add(Pair.of("//profileDesc/abstract//text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("//profileDesc/abstract//text()"), XPathConstants.NODESET)); abstractField.nlmPath. - add(Pair.of("/article/front/article-meta/abstract//text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("/article/front/article-meta/abstract//text()"), XPathConstants.NODESET)); headerFields.add(abstractField); headerLabels.add("abstract"); @@ -120,9 +122,9 @@ public static void setUpFields(List headerFields, keywordsField.fieldName = "keywords"; keywordsField.isTextual = true; keywordsField.grobidPath. - add(Pair.of("//profileDesc/textClass/keywords//text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("//profileDesc/textClass/keywords//text()"), XPathConstants.NODESET)); keywordsField.nlmPath. - add(Pair.of("/article/front/article-meta/kwd-group/kwd/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("/article/front/article-meta/kwd-group/kwd/text()"), XPathConstants.NODESET)); headerFields.add(keywordsField); headerLabels.add("keywords"); @@ -130,9 +132,9 @@ public static void setUpFields(List headerFields, FieldSpecification doiField = new FieldSpecification(); doiField.fieldName = "doi"; doiField.grobidPath. - add(Pair.of("//sourceDesc/biblStruct/idno[@type=\"DOI\"]/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("//sourceDesc/biblStruct/idno[@type=\"DOI\"]/text()"), XPathConstants.NODESET)); doiField.nlmPath. - add(Pair.of("/article/front/article-meta/article-id[@pub-id-type=\"doi\"]/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("/article/front/article-meta/article-id[@pub-id-type=\"doi\"]/text()"), XPathConstants.NODESET)); //headerFields.add(doiField); //headerLabels.add("doi"); @@ -141,8 +143,8 @@ public static void setUpFields(List headerFields, // the first field gives the base path for each citation structure FieldSpecification baseCitation = new FieldSpecification(); baseCitation.fieldName = "base"; - baseCitation.grobidPath.add(Pair.of("//back/div/listBibl/biblStruct", XPathConstants.NODESET)); - baseCitation.nlmPath.add(Pair.of("//ref-list/ref", XPathConstants.NODESET)); // note: sometimes we just have the raw citation bellow this! + baseCitation.grobidPath.add(Pair.of(xPath.compile("//back/div/listBibl/biblStruct"), XPathConstants.NODESET)); + baseCitation.nlmPath.add(Pair.of(xPath.compile("//ref-list/ref"), XPathConstants.NODESET)); // note: sometimes we just have the raw citation bellow this! baseCitation.pdfxPath.add("//ref-list/ref"); // note: there is nothing beyond that in pdfx xml results! citationsFields.add(baseCitation); // the rest of the citation fields are relative to the base path @@ -151,8 +153,8 @@ public static void setUpFields(List headerFields, FieldSpecification titleField2 = new FieldSpecification(); titleField2.fieldName = "title"; titleField2.isTextual = true; - titleField2.grobidPath.add(Pair.of("analytic/title/text()", XPathConstants.NODESET)); - titleField2.nlmPath.add(Pair.of("*/article-title//text()", XPathConstants.NODESET)); + titleField2.grobidPath.add(Pair.of(xPath.compile("analytic/title/text()"), XPathConstants.NODESET)); + titleField2.nlmPath.add(Pair.of(xPath.compile("*/article-title//text()"), XPathConstants.NODESET)); citationsFields.add(titleField2); citationsLabels.add("title"); @@ -160,10 +162,10 @@ public static void setUpFields(List headerFields, FieldSpecification authorField2 = new FieldSpecification(); authorField2.fieldName = "authors"; authorField2.isTextual = true; - authorField2.grobidPath.add(Pair.of("analytic/author/persName/surname/text()", XPathConstants.NODESET)); - authorField2.nlmPath.add(Pair.of("*//surname[parent::name|parent::string-name]/text()", XPathConstants.NODESET)); - //authorField2.nlmPath.add(Pair.of("*//name/surname/text()", XPathConstants.NODESET)); - //authorField2.nlmPath.add(Pair.of("*//string-name/surname/text()", XPathConstants.NODESET)); + authorField2.grobidPath.add(Pair.of(xPath.compile("analytic/author/persName/surname/text()"), XPathConstants.NODESET)); + authorField2.nlmPath.add(Pair.of(xPath.compile("*//surname[parent::name|parent::string-name]/text()"), XPathConstants.NODESET)); + //authorField2.nlmPath.add(Pair.of(xPath.compile("*//name/surname/text()"), XPathConstants.NODESET)); + //authorField2.nlmPath.add(Pair.of(xPath.compile("*//string-name/surname/text()"), XPathConstants.NODESET)); citationsFields.add(authorField2); citationsLabels.add("authors"); @@ -171,18 +173,18 @@ public static void setUpFields(List headerFields, FieldSpecification firstAuthorField2 = new FieldSpecification(); firstAuthorField2.fieldName = "first_author"; firstAuthorField2.isTextual = true; - firstAuthorField2.grobidPath.add(Pair.of("analytic/author[1]/persName/surname/text()", XPathConstants.NODESET)); - //firstAuthorField2.nlmPath.add(Pair.of("*//surname[parent::name|parent::string-name][1]/text()", XPathConstants.NODESET)); - firstAuthorField2.nlmPath.add(Pair.of("*//name[1]/surname/text()", XPathConstants.NODESET)); - firstAuthorField2.nlmPath.add(Pair.of("*//string-name[1]/surname/text()", XPathConstants.NODESET)); + firstAuthorField2.grobidPath.add(Pair.of(xPath.compile("analytic/author[1]/persName/surname/text()"), XPathConstants.NODESET)); + //firstAuthorField2.nlmPath.add(Pair.of(xPath.compile("*//surname[parent::name|parent::string-name][1]/text()"), XPathConstants.NODESET)); + firstAuthorField2.nlmPath.add(Pair.of(xPath.compile("*//name[1]/surname/text()"), XPathConstants.NODESET)); + firstAuthorField2.nlmPath.add(Pair.of(xPath.compile("*//string-name[1]/surname/text()"), XPathConstants.NODESET)); citationsFields.add(firstAuthorField2); citationsLabels.add("first_author"); // date FieldSpecification dateField2 = new FieldSpecification(); dateField2.fieldName = "date"; - dateField2.grobidPath.add(Pair.of("monogr/imprint/date/@when", XPathConstants.NODESET)); - dateField2.nlmPath.add(Pair.of("*/year/text()", XPathConstants.NODESET)); + dateField2.grobidPath.add(Pair.of(xPath.compile("monogr/imprint/date/@when"), XPathConstants.NODESET)); + dateField2.nlmPath.add(Pair.of(xPath.compile("*/year/text()"), XPathConstants.NODESET)); citationsFields.add(dateField2); citationsLabels.add("date"); @@ -190,8 +192,8 @@ public static void setUpFields(List headerFields, FieldSpecification inTitleField2 = new FieldSpecification(); inTitleField2.fieldName = "inTitle"; inTitleField2.isTextual = true; - inTitleField2.grobidPath.add(Pair.of("monogr/title/text()", XPathConstants.NODESET)); - inTitleField2.nlmPath.add(Pair.of("*/source/text()", XPathConstants.NODESET)); + inTitleField2.grobidPath.add(Pair.of(xPath.compile("monogr/title/text()"), XPathConstants.NODESET)); + inTitleField2.nlmPath.add(Pair.of(xPath.compile("*/source/text()"), XPathConstants.NODESET)); citationsFields.add(inTitleField2); citationsLabels.add("inTitle"); @@ -199,9 +201,9 @@ public static void setUpFields(List headerFields, FieldSpecification volumeField = new FieldSpecification(); volumeField.fieldName = "volume"; volumeField.grobidPath. - add(Pair.of("monogr/imprint/biblScope[@unit=\"volume\" or @unit=\"vol\"]/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("monogr/imprint/biblScope[@unit=\"volume\" or @unit=\"vol\"]/text()"), XPathConstants.NODESET)); volumeField.nlmPath. - add(Pair.of("*/volume/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("*/volume/text()"), XPathConstants.NODESET)); citationsFields.add(volumeField); citationsLabels.add("volume"); @@ -209,9 +211,9 @@ public static void setUpFields(List headerFields, FieldSpecification issueField = new FieldSpecification(); issueField.fieldName = "issue"; issueField.grobidPath. - add(Pair.of("monogr/imprint/biblScope[@unit=\"issue\"]/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("monogr/imprint/biblScope[@unit=\"issue\"]/text()"), XPathConstants.NODESET)); issueField.nlmPath. - add(Pair.of("*/issue/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("*/issue/text()"), XPathConstants.NODESET)); citationsFields.add(issueField); citationsLabels.add("issue"); @@ -219,9 +221,9 @@ public static void setUpFields(List headerFields, FieldSpecification pageField = new FieldSpecification(); pageField.fieldName = "page"; pageField.grobidPath. - add(Pair.of("monogr/imprint/biblScope[@unit=\"page\"]/@from", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("monogr/imprint/biblScope[@unit=\"page\"]/@from"), XPathConstants.NODESET)); pageField.nlmPath. - add(Pair.of("*/fpage/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("*/fpage/text()"), XPathConstants.NODESET)); citationsFields.add(pageField); citationsLabels.add("page"); @@ -230,9 +232,9 @@ public static void setUpFields(List headerFields, publisherField.fieldName = "publisher"; publisherField.isTextual = true; publisherField.grobidPath. - add(Pair.of("monogr/imprint/publisher/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("monogr/imprint/publisher/text()"), XPathConstants.NODESET)); publisherField.nlmPath. - add(Pair.of("*/publisher-name/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("*/publisher-name/text()"), XPathConstants.NODESET)); //citationsFields.add(publisherField); //citationsLabels.add("publisher"); @@ -240,8 +242,8 @@ public static void setUpFields(List headerFields, FieldSpecification citationIdField = new FieldSpecification(); citationIdField.fieldName = "id"; citationIdField.isTextual = true; - citationIdField.grobidPath.add(Pair.of("@id", XPathConstants.NODESET)); - citationIdField.nlmPath.add(Pair.of("@id", XPathConstants.NODESET)); + citationIdField.grobidPath.add(Pair.of(xPath.compile("@id"), XPathConstants.NODESET)); + citationIdField.nlmPath.add(Pair.of(xPath.compile("@id"), XPathConstants.NODESET)); citationsFields.add(citationIdField); citationsLabels.add("id"); @@ -250,9 +252,9 @@ public static void setUpFields(List headerFields, citationDOIField.fieldName = "doi"; citationDOIField.isTextual = true; citationDOIField.grobidPath. - add(Pair.of("analytic/idno[@type=\"DOI\"]/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("analytic/idno[@type=\"DOI\"]/text()"), XPathConstants.NODESET)); citationDOIField.nlmPath. - add(Pair.of("*/pub-id[@pub-id-type=\"doi\"]/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("*/pub-id[@pub-id-type=\"doi\"]/text()"), XPathConstants.NODESET)); citationsFields.add(citationDOIField); citationsLabels.add("doi"); @@ -261,9 +263,9 @@ public static void setUpFields(List headerFields, citationPMIDField.fieldName = "pmid"; citationPMIDField.isTextual = true; citationPMIDField.grobidPath. - add(Pair.of("analytic/idno[@type=\"PMID\"]/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("analytic/idno[@type=\"PMID\"]/text()"), XPathConstants.NODESET)); citationPMIDField.nlmPath. - add(Pair.of("*/pub-id[@pub-id-type=\"pmid\"]/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("*/pub-id[@pub-id-type=\"pmid\"]/text()"), XPathConstants.NODESET)); citationsFields.add(citationPMIDField); citationsLabels.add("pmid"); @@ -272,9 +274,9 @@ public static void setUpFields(List headerFields, citationPMCIDField.fieldName = "pmcid"; citationPMCIDField.isTextual = true; citationPMCIDField.grobidPath. - add(Pair.of("analytic/idno[@type=\"PMCID\"]/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("analytic/idno[@type=\"PMCID\"]/text()"), XPathConstants.NODESET)); citationPMCIDField.nlmPath. - add(Pair.of("*/pub-id[@pub-id-type=\"pmcid\"]/text()", XPathConstants.NODESET)); + add(Pair.of(xPath.compile("*/pub-id[@pub-id-type=\"pmcid\"]/text()"), XPathConstants.NODESET)); citationsFields.add(citationPMCIDField); citationsLabels.add("pmcid"); @@ -295,40 +297,40 @@ public static void setUpFields(List headerFields, sectionTitleField.isTextual = true; //LF: added //text() at the end instead of /text() so that possible child nodes are also included in the xpath - sectionTitleField.grobidPath.add(Pair.of("//text/body/div/head//text()", XPathConstants.NODESET)); - sectionTitleField.nlmPath.add(Pair.of("//body//sec/title//text()", XPathConstants.NODESET)); + sectionTitleField.grobidPath.add(Pair.of(xPath.compile("//text/body/div/head//text()"), XPathConstants.NODESET)); + sectionTitleField.nlmPath.add(Pair.of(xPath.compile("//body//sec/title//text()"), XPathConstants.NODESET)); fulltextFields.add(sectionTitleField); fulltextLabels.add("section_title"); FieldSpecification referenceMarkerField = new FieldSpecification(); referenceMarkerField.fieldName = "reference_citation"; referenceMarkerField.isTextual = true; - referenceMarkerField.grobidPath.add(Pair.of("//ref[@type=\"bibr\"]/text()", XPathConstants.NODESET)); - referenceMarkerField.nlmPath.add(Pair.of("//xref[@ref-type=\"bibr\"]/text()", XPathConstants.NODESET)); + referenceMarkerField.grobidPath.add(Pair.of(xPath.compile("//ref[@type=\"bibr\"]/text()"), XPathConstants.NODESET)); + referenceMarkerField.nlmPath.add(Pair.of(xPath.compile("//xref[@ref-type=\"bibr\"]/text()"), XPathConstants.NODESET)); fulltextFields.add(referenceMarkerField); fulltextLabels.add("reference_citation"); FieldSpecification referenceFigureField = new FieldSpecification(); referenceFigureField.fieldName = "reference_figure"; referenceFigureField.isTextual = true; - referenceFigureField.grobidPath.add(Pair.of("//ref[@type=\"figure\"]/text()", XPathConstants.NODESET)); - referenceFigureField.nlmPath.add(Pair.of("//xref[@ref-type=\"fig\"]/text()", XPathConstants.NODESET)); + referenceFigureField.grobidPath.add(Pair.of(xPath.compile("//ref[@type=\"figure\"]/text()"), XPathConstants.NODESET)); + referenceFigureField.nlmPath.add(Pair.of(xPath.compile("//xref[@ref-type=\"fig\"]/text()"), XPathConstants.NODESET)); fulltextFields.add(referenceFigureField); fulltextLabels.add("reference_figure"); FieldSpecification referenceTableField = new FieldSpecification(); referenceTableField.fieldName = "reference_table"; referenceTableField.isTextual = true; - referenceTableField.grobidPath.add(Pair.of("//ref[@type=\"table\"]/text()", XPathConstants.NODESET)); - referenceTableField.nlmPath.add(Pair.of("//xref[@ref-type=\"table\"]/text()", XPathConstants.NODESET)); + referenceTableField.grobidPath.add(Pair.of(xPath.compile("//ref[@type=\"table\"]/text()"), XPathConstants.NODESET)); + referenceTableField.nlmPath.add(Pair.of(xPath.compile("//xref[@ref-type=\"table\"]/text()"), XPathConstants.NODESET)); fulltextFields.add(referenceTableField); fulltextLabels.add("reference_table"); FieldSpecification figureTitleField = new FieldSpecification(); figureTitleField.fieldName = "figure_title"; figureTitleField.isTextual = true; - figureTitleField.grobidPath.add(Pair.of("//figure[not(@type)]/head/text()", XPathConstants.NODESET)); - figureTitleField.nlmPath.add(Pair.of("//fig/label/text()", XPathConstants.NODESET)); + figureTitleField.grobidPath.add(Pair.of(xPath.compile("//figure[not(@type)]/head/text()"), XPathConstants.NODESET)); + figureTitleField.nlmPath.add(Pair.of(xPath.compile("//fig/label/text()"), XPathConstants.NODESET)); fulltextFields.add(figureTitleField); fulltextLabels.add("figure_title"); @@ -355,8 +357,8 @@ public static void setUpFields(List headerFields, FieldSpecification tableTitleField = new FieldSpecification(); tableTitleField.fieldName = "table_title"; tableTitleField.isTextual = true; - tableTitleField.grobidPath.add(Pair.of("//figure[@type=\"table\"]/head/text()", XPathConstants.NODESET)); - tableTitleField.nlmPath.add(Pair.of("//table-wrap/label/text()", XPathConstants.NODESET)); + tableTitleField.grobidPath.add(Pair.of(xPath.compile("//figure[@type=\"table\"]/head/text()"), XPathConstants.NODESET)); + tableTitleField.nlmPath.add(Pair.of(xPath.compile("//table-wrap/label/text()"), XPathConstants.NODESET)); fulltextFields.add(tableTitleField); fulltextLabels.add("table_title"); @@ -402,10 +404,10 @@ public static void setUpFields(List headerFields, dataAvailabilityFulltextField.fieldName = "data_availability"; dataAvailabilityFulltextField.isTextual = true; dataAvailabilityFulltextField.grobidPath - // .add(Pair.of("//div[@type=\"data_availability\"]//text()", XPathConstants.NODESET)); - .add(Pair.of("//node[@type=\"availability\"]//text()", XPathConstants.NODESET)); + // .add(Pair.of(xPath.compile("//div[@type=\"data_availability\"]//text()"), XPathConstants.NODESET)); + .add(Pair.of(xPath.compile("//node[@type=\"availability\"]//text()"), XPathConstants.NODESET)); dataAvailabilityFulltextField.grobidPath - .add(Pair.of("//div[@type=\"availability\"]//text()", XPathConstants.NODESET)); + .add(Pair.of(xPath.compile("//div[@type=\"availability\"]//text()"), XPathConstants.NODESET)); //translate(x, "...", "...") is the ugly version of lower-case(.) which is not supported here apparently (only xpath 2.0) /* @@ -420,13 +422,17 @@ public static void setUpFields(List headerFields, */ dataAvailabilityFulltextField.nlmPath -// .add(Pair.of("normalize-space(.//article/body/sec[title[" + xpathTitle + "]])", XPathConstants.STRING)); - .add(Pair.of("normalize-space(.//sec[@sec-type=\"availability\"])", XPathConstants.STRING)); +// .add(Pair.of(xPath.compile("normalize-space(.//article/body/sec[title[" + xpathTitle + "]])"), XPathConstants.STRING)); + .add(Pair.of(xPath.compile("normalize-space(.//sec[@sec-type=\"availability\"])"), XPathConstants.STRING)); + dataAvailabilityFulltextField.nlmPath + .add(Pair.of(xPath.compile("normalize-space(.//p[@content-type=\"availability\"])"), XPathConstants.STRING)); + dataAvailabilityFulltextField.nlmPath + .add(Pair.of(xPath.compile("normalize-space(.//sec[@specific-use=\"availability\"])"), XPathConstants.STRING)); // dataAvailabilityFulltextField.nlmPath -// .add(Pair.of("normalize-space(.//article/back/sec[title[" + xpathTitle + "]])", XPathConstants.STRING)); -// .add(Pair.of("normalize-space(.//article/back//sec[@sec-type=\"data-availability\"])", XPathConstants.STRING)); +// .add(Pair.of(xPath.compile("normalize-space(.//article/back/sec[title[" + xpathTitle + "]])"), XPathConstants.STRING)); +// .add(Pair.of(xPath.compile("normalize-space(.//article/back//sec[@sec-type=\"data-availability\"])"), XPathConstants.STRING)); dataAvailabilityFulltextField.nlmPath - .add(Pair.of("normalize-space(.//p[@content-type=\"availability\"])", XPathConstants.STRING)); + .add(Pair.of(xPath.compile("normalize-space(.//p[@content-type=\"availability\"])"), XPathConstants.STRING)); fulltextFields.add(dataAvailabilityFulltextField); fulltextLabels.add("availability");