From 470c954d6a5f9486c2330ce830198c4ccf76390d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felicitas=20L=C3=B6ffler?=
Date: Sun, 4 Dec 2022 23:03:22 +0100
Subject: [PATCH] initial commit

---
 Mimir6.2/plugins/CFIDFExactScorer/pom.xml     |  76 ++
 .../cs/fusion/score/CFIDFExactScorer.java     | 395 ++++++++++
 .../src/main/resources/creole.xml             |   1 +
 .../src/main/resources/log4j.properties       |  13 +
 .../src/main/resources/logback.xml            |  39 +
 Mimir6.2/plugins/HCFIDFScorer/pom.xml         |  71 ++
 .../unijena/cs/fusion/score/HCFIDFScorer.java | 482 ++++++++++++
 .../src/main/resources/creole.xml             |   1 +
 .../src/main/resources/logback.xml            |  39 +
 Mimir6.2/plugins/TaxonomicScorer/pom.xml      |  72 ++
 .../cs/fusion/score/TaxonomicScorer.java      | 478 ++++++++++++
 .../src/main/resources/creole.xml             |   1 +
 .../src/main/resources/log4j.properties      |  13 +
 .../src/main/resources/logback.xml            |  39 +
 .../mimir/cloud/MimirScorerService.groovy     |  48 ++
 MimirSearchAPI/pom.xml                        |  50 ++
 .../de/unijena/cs/fusion/mimir/Document.java  | 130 +++
 .../java/de/unijena/cs/fusion/mimir/Hit.java  |  50 ++
 .../de/unijena/cs/fusion/mimir/Metadata.java  |  24 +
 .../unijena/cs/fusion/mimir/MimirSearch.java  | 737 ++++++++++++++++++
 MimirSearchAPI/src/main/resources/logback.xml |  38 +
 .../src/test/java/test/MimirSearchTest.java   |  51 ++
 MimirTest/mimirTest/pom.xml                   |  53 ++
 .../cs/fusion/mimirTest/EvaluationThread.java | 200 +++++
 .../cs/fusion/mimirTest/MimirTest.java        | 413 ++++++++++
 .../de/unijena/cs/fusion/mimirTest/Query.java |  33 +
 .../mimirTest/src/main/resources/logback.xml  |  40 +
 .../uni_jena/cs/fusion/mimirTest/AppTest.java |  38 +
 .../slib/sml/sm/core/engine/SM_Engine.java    |  84 ++
 slibAPI/pom.xml                               | 203 +++++
 .../java/de/unijena/cs/fusion/slib/SML.java   | 301 +++++++
 slibAPI/src/main/resources/logback.xml        |  37 +
 32 files changed, 4250 insertions(+)
 create mode 100644 Mimir6.2/plugins/CFIDFExactScorer/pom.xml
 create mode 100644 Mimir6.2/plugins/CFIDFExactScorer/src/main/java/de/unijena/cs/fusion/score/CFIDFExactScorer.java
 create mode 100644 Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/creole.xml
 create mode 100644 Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/log4j.properties
 create mode 100644 Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/logback.xml
 create mode 100644 Mimir6.2/plugins/HCFIDFScorer/pom.xml
 create mode 100644 Mimir6.2/plugins/HCFIDFScorer/src/main/java/de/unijena/cs/fusion/score/HCFIDFScorer.java
 create mode 100644 Mimir6.2/plugins/HCFIDFScorer/src/main/resources/creole.xml
 create mode 100644 Mimir6.2/plugins/HCFIDFScorer/src/main/resources/logback.xml
 create mode 100644 Mimir6.2/plugins/TaxonomicScorer/pom.xml
 create mode 100644 Mimir6.2/plugins/TaxonomicScorer/src/main/java/de/unijena/cs/fusion/score/TaxonomicScorer.java
 create mode 100644 Mimir6.2/plugins/TaxonomicScorer/src/main/resources/creole.xml
 create mode 100644 Mimir6.2/plugins/TaxonomicScorer/src/main/resources/log4j.properties
 create mode 100644 Mimir6.2/plugins/TaxonomicScorer/src/main/resources/logback.xml
 create mode 100644 Mimir6.2/webapp/mimir-cloud/grails-app/services/gate/mimir/cloud/MimirScorerService.groovy
 create mode 100644 MimirSearchAPI/pom.xml
 create mode 100644 MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Document.java
 create mode 100644 MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Hit.java
 create mode 100644 MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Metadata.java
 create mode 100644 MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/MimirSearch.java
 create mode 100644 MimirSearchAPI/src/main/resources/logback.xml
create mode 100644 MimirSearchAPI/src/test/java/test/MimirSearchTest.java create mode 100644 MimirTest/mimirTest/pom.xml create mode 100644 MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/EvaluationThread.java create mode 100644 MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/MimirTest.java create mode 100644 MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/Query.java create mode 100644 MimirTest/mimirTest/src/main/resources/logback.xml create mode 100644 MimirTest/mimirTest/src/test/java/de/uni_jena/cs/fusion/mimirTest/AppTest.java create mode 100644 slib-sml/src/main/java/slib/sml/sm/core/engine/SM_Engine.java create mode 100644 slibAPI/pom.xml create mode 100644 slibAPI/src/main/java/de/unijena/cs/fusion/slib/SML.java create mode 100644 slibAPI/src/main/resources/logback.xml diff --git a/Mimir6.2/plugins/CFIDFExactScorer/pom.xml b/Mimir6.2/plugins/CFIDFExactScorer/pom.xml new file mode 100644 index 0000000..7f627ff --- /dev/null +++ b/Mimir6.2/plugins/CFIDFExactScorer/pom.xml @@ -0,0 +1,76 @@ + + + + 4.0.0 + + + uk.ac.gate + gate-plugin-base + + 8.6 + + + + + + + uk.ac.gate.mimir + mimir-plugin-CFIDFExactScorer + 6.2-SNAPSHOT + + CFIDFExactScorer + blbla + + + + + GNU Lesser General Public License (LGPL), Version 3 + http://www.gnu.org/licenses/lgpl-3.0.txt + repo + + + + + UTF-8 + 1.8 + 1.8 + + + + + + + uk.ac.gate.mimir + mimir-core + 6.2-SNAPSHOT + provided + + + + + + + + junit + junit + 4.12 + + + + + com.fasterxml.jackson.dataformat + jackson-dataformat-csv + 2.10.3 + + + + de.unijena.cs.fusion + slibAPI + 0.0.1 + + + + diff --git a/Mimir6.2/plugins/CFIDFExactScorer/src/main/java/de/unijena/cs/fusion/score/CFIDFExactScorer.java b/Mimir6.2/plugins/CFIDFExactScorer/src/main/java/de/unijena/cs/fusion/score/CFIDFExactScorer.java new file mode 100644 index 0000000..bcb1834 --- /dev/null +++ b/Mimir6.2/plugins/CFIDFExactScorer/src/main/java/de/unijena/cs/fusion/score/CFIDFExactScorer.java @@ -0,0 +1,395 @@ + +package de.unijena.cs.fusion.score; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import gate.mimir.ConstraintType; +import gate.mimir.SemanticAnnotationHelper; +import gate.mimir.search.QueryEngine; +import gate.mimir.search.query.Binding; +import gate.mimir.search.query.QueryExecutor; +import gate.mimir.search.query.QueryNode; +import gate.mimir.search.score.MimirScorer; +import gate.mimir.util.DelegatingSemanticAnnotationHelper; +import it.unimi.di.big.mg4j.index.Index; +import it.unimi.di.big.mg4j.index.IndexIterator; +import it.unimi.di.big.mg4j.search.DocumentIterator; +import it.unimi.di.big.mg4j.search.score.AbstractWeightedScorer; +import it.unimi.di.big.mg4j.search.visitor.CounterCollectionVisitor; +import it.unimi.di.big.mg4j.search.visitor.CounterSetupVisitor; +import it.unimi.di.big.mg4j.search.visitor.TermCollectionVisitor; +import it.unimi.dsi.fastutil.ints.IntBigList; + + + +public class CFIDFExactScorer extends AbstractWeightedScorer implements MimirScorer{ + private static final long serialVersionUID = 3855212427922484546L; + + + private static final boolean DEBUG = true; + + /** The counter collection visitor used to estimate counts. */ + private final CounterCollectionVisitor counterCollectionVisitor; + /** The counter setup visitor used to estimate counts. 
*/ + private final CounterSetupVisitor setupVisitor; + /** The term collection visitor used to estimate counts. */ + private final TermCollectionVisitor termVisitor; + + /** An array (parallel to {@link #currIndex}) that caches size lists. */ + private IntBigList sizes[]; + /** An array (parallel to {@link #currIndex}) used by {@link #score()} to cache the current document sizes. */ + private int[] size; + /** An array indexed by offsets that caches the inverse document-frequency part of the formula, multiplied by the index weight. */ + private double[] weightedIdfPart; + + private ArrayList URIsInQuery; + + private ArrayList annTypesInQuery; + + + + + private static double ALPHA = 0.5; + + double alphaE = 1.0; + double alphaC = 1.0; + + boolean alpha_beta = false; + + private static final Logger logger = LoggerFactory.getLogger(CFIDFExactScorer.class); + + + + + public CFIDFExactScorer() { + + termVisitor = new TermCollectionVisitor(); + setupVisitor = new CounterSetupVisitor( termVisitor ); + counterCollectionVisitor = new CounterCollectionVisitor( setupVisitor ); + + + } + + + public synchronized CFIDFExactScorer copy() { + final CFIDFExactScorer scorer = new CFIDFExactScorer(); + scorer.setWeights( index2Weight ); + return scorer; + } + + @Override + public double score(Index index) throws IOException { + return score(); + } + + /** + * computes the score per document for a given query + */ + @Override + public double score() throws IOException { + + + setupVisitor.clear(); + documentIterator.acceptOnTruePaths( counterCollectionVisitor ); + this.underlyingExecutor = (QueryExecutor)documentIterator; + QueryEngine engine = this.underlyingExecutor.getQueryEngine(); + + if ( documentIterator instanceof IndexIterator ) indexIterator = new IndexIterator[] { (IndexIterator)documentIterator }; + + final long document = documentIterator.document(); + //logger.info("document: "+document); + + final int[] count = setupVisitor.count; //TF - Array with frequencies of a term in a document + final int[] indexNumber = setupVisitor.indexNumber; + final double[] weightedIdfPart = this.weightedIdfPart; //idf weights + final int[] size = this.size; //Array of document sizes (document length, important for normalization) + + String[] terms = setupVisitor.termId2Term; + + HashMap URI_id = new HashMap(); + HashMap id_URI = new HashMap(); + + HashMap URI_broader_id = new HashMap(); + HashMap id_broader_URI = new HashMap(); + + HashMap URI_category = new HashMap(); + HashMap category_URI = new HashMap(); + //create a URI-ID Map + + + for (int i = 0; i< terms.length; i++){ + if(count[ i ] != 0 && terms[i].contains(":")){ + logger.info("terms["+i+"]="+terms[i]); + String annType = terms[i].split(":")[0]; + //System.out.println(annType); + SemanticAnnotationHelper annHelper = engine.getAnnotationHelper(annType); + + if(annHelper!=null){ + String annotation = annHelper.describeMention(terms[i]); + + + //System.out.println(annotation); + String URI = extractInstFromAnnotation(annotation); + String broader = extractBroaderFromAnnotation(annotation); + //logger.info(URI); + //logger.info(broader); + if(URI == null){ + //ToDo + } + if(URI != null){ + URI_id.put(URI, i); + id_URI.put(i, URI); + } + if(broader != null){ + id_broader_URI.put(i, broader); + URI_broader_id.put(broader, i); + } + + } + + + } + } + + //logger.info("inst_id:"+ URI_id.keySet()); + //logger.info("broader_id:"+ id_broader_URI.keySet()); + //i = number of total documents + + for( int i = size.length; i-- != 0; ) size[ i ] = sizes[ i ].getInt( 
document ); + + int k; + double score = 0; + //System.out.println("count.length: "+ count.length); + + + for ( int i = count.length; i-- != 0; ) { + + k = indexNumber[ i ]; + + + if(count[ i ] != 0 && terms[i].contains(":")){ + + + String uri = id_URI.get(i); + String relatedURI = id_broader_URI.get(i); + + //logger.info("count["+i+"]:"+count[ i ]); + //logger.info("size["+k+"]:"+size[ k ]); + //logger.info("weightedIdfPart["+i+"]:"+weightedIdfPart[i ]); + + + //exact match + if((uri!=null|relatedURI!=null) && URIsInQuery!=null && (URIsInQuery.contains(uri) || URIsInQuery.contains(relatedURI) )){ + + + score += ((double)count[ i ] / size[ k ] * weightedIdfPart[ i ]) ; + + } + + + } + //no fallback - only URI based ranking + } + + + logger.info("score: "+ score); + logger.info("---------------------"); + return score; + + + + } + + + + +/** + * extracts the inst feature value from an annotation String in the form {Material inst = http://purl.obolibrary.org/obo/ENVO_00001998} + * @param annotation + * @return inst feature, e.g., http://purl.obolibrary.org/obo/ENVO_00000109 + */ + private String extractInstFromAnnotation(String annotation) { + //System.out.println("Annotation: " + annotation); + String patternInst = "\\binst = ((https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]_[0-9]+)"; + + if(annotation!=null && annotation.length()>0){ + Pattern r = Pattern.compile(patternInst); + Matcher m = r.matcher(annotation); + if (m.find()) { + + return m.group(1); + + } + } + + return null; +} + + /** + * extracts the broader feature value from an annotation String in the form {Material broader = http://purl.obolibrary.org/obo/ENVO_00001998} + * @param annotation + * @return broader feature, e.g., http://purl.obolibrary.org/obo/ENVO_00000109 + */ + private String extractBroaderFromAnnotation(String annotation) { + //System.out.println("Annotation: " + annotation); + String pattern = "\\bbroader = ((https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]_[0-9]+)"; + + if(annotation!=null && annotation.length()>0){ + Pattern r = Pattern.compile(pattern); + Matcher m = r.matcher(annotation); + if (m.find()) { + return m.group(1); + } + } + + return null; +} + + + + /** + * extracts a URI from a QueryString in the form {Material inst = http://purl.obolibrary.org/obo/ENVO_00001998} + * @param annotation + * @return List, e.g., http://purl.obolibrary.org/obo/ENVO_00000109 + */ + private ArrayList extractURIFromQuery(String query) { + //System.out.println("Annotation: " + annotation); + ArrayList uris = new ArrayList(); + + String patternUri = "\\b(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]_[0-9]+"; + + if(query!=null && query.length()>0){ + + + Pattern r = Pattern.compile(patternUri); + Matcher m = r.matcher(query); + while (m.find()) { + uris.add(m.group(0)); + } + } + + return uris; +} + + + +@Override + public boolean usesIntervals() { + return false; + } + + + @Override + public void wrap(DocumentIterator documentIterator) throws IOException { + super.wrap(documentIterator); + this.underlyingExecutor = (QueryExecutor)documentIterator; + + + QueryNode query = this.underlyingExecutor.getQueryNode(); + + + + String querySegmentString = query.toString(); + logger.info("Query: " +querySegmentString); + + URIsInQuery = extractURIFromQuery(querySegmentString); + annTypesInQuery = extractAnnTypesFromQuery(querySegmentString); + + logger.info("URIs in query:" +URIsInQuery); + + QueryEngine engine = this.underlyingExecutor.getQueryEngine(); + + + 
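+		/* Scoring sketch (illustrative numbers, not from the sources): for every concept
+		 * term i whose URI (or broader URI) also occurs in the query, score() above adds
+		 *   score += count[i] / size[k] * weightedIdfPart[i]
+		 * where wrap() below precomputes weightedIdfPart[i] = ln(N / df_i) * indexWeight.
+		 * E.g., with N = 1000 indexed documents, df = 10 documents containing the concept,
+		 * tf = 3 mentions in a document of 200 tokens:
+		 *   idf = ln(1000/10) ≈ 4.605, contribution ≈ 3/200 * 4.605 ≈ 0.069.
+		 */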
/* Note that we use the index array provided by the weight function, *not* by the visitor or by the iterator. + * If the function has an empty domain, this call is equivalent to prepare(). */ + termVisitor.prepare( index2Weight.keySet() ); + + //Visitor pattern, accept method calls the visit Method in TermCollectionVisitor + //here, the actual "filling" takes place + documentIterator.accept( termVisitor ); + + //if ( DEBUG ) logger.debug( "Term Visitor found " + termVisitor.numberOfPairs() + " leaves" ); + + // Note that we use the index array provided by the visitor, *not* by the iterator. + final Index[] index = termVisitor.indices(); + + + + if ( DEBUG ) logger.debug( "Indices: " + Arrays.toString( index ) ); + + // Some caching of frequently-used values + sizes = new IntBigList[ index.length ]; + //System.out.println("Index.length:"+index.length); + for( int i = index.length; i-- != 0; ) + if ( ( sizes[ i ] = index[ i ].sizes ) == null ) throw new IllegalStateException( "A BM25 scorer requires document sizes" ); + + setupVisitor.prepare(); + + + documentIterator.accept( setupVisitor ); + + final long[] frequency = setupVisitor.frequency; + final int[] indexNumber = setupVisitor.indexNumber; + + + + // We do all logs here, and multiply by the weight + weightedIdfPart = new double[ frequency.length ]; + for( int i = weightedIdfPart.length; i-- != 0; ) { + //System.out.println("frequency:"+frequency[i]); + logger.info("frequency["+i+"]:"+frequency[i]); + logger.info("indexNumber["+i+"]:"+indexNumber[i]); + logger.info("index2Weight.index[indexNumber["+i+"]]:"+index2Weight.getDouble( index[ indexNumber[ i ] ] )); + logger.info("index[indexNumber["+i+"]].numberOfDocuments:"+index[ indexNumber[ i ] ].numberOfDocuments ); + + weightedIdfPart[ i ] = Math.log( index[ indexNumber[ i ] ].numberOfDocuments / (double)frequency[ i ] ) * index2Weight.getDouble( index[ indexNumber[ i ] ] ); + //System.out.println("weightedIdfPart[i]: " + weightedIdfPart[ i ]); + logger.info("weightedIdfPart["+i+"]: " + weightedIdfPart[ i ]); + } + size = new int[ index.length ]; + + + } + +private ArrayList extractAnnTypesFromQuery(String query) { + //System.out.println("Annotation: " + annotation); + ArrayList annTypes = new ArrayList(); + + //String patternUri = "\\b(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]_[0-9]+"; + String patternType = "\\btype = ([A-Za-z]+)"; + if(query!=null && query.length()>0){ + + + Pattern r = Pattern.compile(patternType); + Matcher m = r.matcher(query); + while (m.find()) { + annTypes.add(m.group(1)); + } + } + + return annTypes; +} + +protected QueryExecutor underlyingExecutor; + + + + public long nextDocument(long greaterThan) throws IOException { + return underlyingExecutor.nextDocument(greaterThan); + } + + public Binding nextHit() throws IOException { + return underlyingExecutor.nextHit(); + } + + +} diff --git a/Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/creole.xml b/Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/creole.xml new file mode 100644 index 0000000..2a2d21b --- /dev/null +++ b/Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/creole.xml @@ -0,0 +1 @@ + diff --git a/Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/log4j.properties b/Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/log4j.properties new file mode 100644 index 0000000..44e8045 --- /dev/null +++ b/Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/log4j.properties @@ -0,0 +1,13 @@ +# Root logger option +log4j.rootLogger=DEBUG, console, file + +# console 
+log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=[%t] %-5p %c %x - %m%n + +# file +log4j.appender.file=org.apache.log4j.RollingFileAppender +log4j.appender.file.layout=org.apache.log4j.PatternLayout +log4j.appender.file.layout.ConversionPattern=[%t] %-5p %c %x - %m%n +log4j.appender.file.File=C:/Benutzer/Felicitas_adm/mimir/log.txt \ No newline at end of file diff --git a/Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/logback.xml b/Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/logback.xml new file mode 100644 index 0000000..0d2b383 --- /dev/null +++ b/Mimir6.2/plugins/CFIDFExactScorer/src/main/resources/logback.xml @@ -0,0 +1,39 @@ + + + + + logs/output.log + + + logs/output-%d{yyyy-MM-dd}.%i.log + + + 100MB + + + + + %d - %-5level %logger{36} - %msg %n + + false + + + + + + %d - %-5level %logger{36} - %msg %n + + + + + + + + + + + + + + + diff --git a/Mimir6.2/plugins/HCFIDFScorer/pom.xml b/Mimir6.2/plugins/HCFIDFScorer/pom.xml new file mode 100644 index 0000000..fa8de62 --- /dev/null +++ b/Mimir6.2/plugins/HCFIDFScorer/pom.xml @@ -0,0 +1,71 @@ + + + + 4.0.0 + + + uk.ac.gate + gate-plugin-base + + 8.6 + + + + + + + uk.ac.gate.mimir + mimir-plugin-HCFIDFScorer + 6.2-SNAPSHOT + + HCFIDFScorer + blbla + + + + + GNU Lesser General Public License (LGPL), Version 3 + http://www.gnu.org/licenses/lgpl-3.0.txt + repo + + + + + UTF-8 + 1.8 + 1.8 + + + + + + uk.ac.gate.mimir + mimir-core + 6.2-SNAPSHOT + provided + + + + + + + + junit + junit + 4.12 + + + + + de.unijena.cs.fusion + slibAPI + 0.0.1 + + + + + + diff --git a/Mimir6.2/plugins/HCFIDFScorer/src/main/java/de/unijena/cs/fusion/score/HCFIDFScorer.java b/Mimir6.2/plugins/HCFIDFScorer/src/main/java/de/unijena/cs/fusion/score/HCFIDFScorer.java new file mode 100644 index 0000000..31a2048 --- /dev/null +++ b/Mimir6.2/plugins/HCFIDFScorer/src/main/java/de/unijena/cs/fusion/score/HCFIDFScorer.java @@ -0,0 +1,482 @@ +package de.unijena.cs.fusion.score; + + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.openrdf.model.URI; +import org.openrdf.model.impl.URIImpl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import de.unijena.cs.fusion.slib.SML; +import gate.mimir.ConstraintType; +import gate.mimir.SemanticAnnotationHelper; +import gate.mimir.search.QueryEngine; +import gate.mimir.search.query.Binding; +import gate.mimir.search.query.QueryExecutor; +import gate.mimir.search.query.QueryNode; +import gate.mimir.search.score.MimirScorer; +import gate.mimir.util.DelegatingSemanticAnnotationHelper; +import it.unimi.di.big.mg4j.index.Index; +import it.unimi.di.big.mg4j.index.IndexIterator; +import it.unimi.di.big.mg4j.search.DocumentIterator; +import it.unimi.di.big.mg4j.search.score.AbstractWeightedScorer; +import it.unimi.di.big.mg4j.search.visitor.CounterCollectionVisitor; +import it.unimi.di.big.mg4j.search.visitor.CounterSetupVisitor; +import it.unimi.di.big.mg4j.search.visitor.TermCollectionVisitor; +import it.unimi.dsi.fastutil.ints.IntBigList; +import slib.sml.sm.core.engine.SM_Engine; + + +public class HCFIDFScorer extends AbstractWeightedScorer implements MimirScorer{ + private static final long serialVersionUID = 3855212427922484546L; + + private static final boolean DEBUG = true; + + /** 
The counter collection visitor used to estimate counts. */ + private final CounterCollectionVisitor counterCollectionVisitor; + /** The counter setup visitor used to estimate counts. */ + private final CounterSetupVisitor setupVisitor; + /** The term collection visitor used to estimate counts. */ + private final TermCollectionVisitor termVisitor; + + /** An array (parallel to {@link #currIndex}) that caches size lists. */ + private IntBigList sizes[]; + /** An array (parallel to {@link #currIndex}) used by {@link #score()} to cache the current document sizes. */ + private int[] size; + /** An array indexed by offsets that caches the inverse document-frequency part of the formula, multiplied by the index weight. */ + private double[] weightedIdfPart; + + + private ArrayList URIsInQuery; + + + private HashMap graphMap; + + private HashMap graphNodeLevelMap; + private HashMap graphURINodeLevelMap; + + + + private static final Logger logger = LoggerFactory.getLogger(HCFIDFScorer.class); + + public HCFIDFScorer(SML sml) { + + termVisitor = new TermCollectionVisitor(); + setupVisitor = new CounterSetupVisitor( termVisitor ); + counterCollectionVisitor = new CounterCollectionVisitor( setupVisitor ); + + graphMap = sml.getGraphMap(); + graphNodeLevelMap = sml.getGraphNodeLevelMap(); + graphURINodeLevelMap = sml.getGraphURINodeLevelMap(); + + } + + + public synchronized HCFIDFScorer copy() { + final HCFIDFScorer scorer = new HCFIDFScorer(new SML()); + scorer.setWeights( index2Weight ); + return scorer; + } + + @Override + public double score(Index index) throws IOException { + return score(); + } + + /** + * computes the score per document for a given query + */ + @Override + public double score() throws IOException { + + + setupVisitor.clear(); + documentIterator.acceptOnTruePaths( counterCollectionVisitor ); + this.underlyingExecutor = (QueryExecutor)documentIterator; + QueryEngine engine = this.underlyingExecutor.getQueryEngine(); + //MimirIndex mimirIndex = engine.getIndex(); + + if ( documentIterator instanceof IndexIterator ) indexIterator = new IndexIterator[] { (IndexIterator)documentIterator }; + + final long document = documentIterator.document(); + //System.out.println("document: "+document); + + final int[] count = setupVisitor.count; //TF - Array with frequencies of a term in a document + final int[] indexNumber = setupVisitor.indexNumber; + final double[] weightedIdfPart = this.weightedIdfPart; //idf weights + final int[] size = this.size; //Array of document sizes + + String[] terms = setupVisitor.termId2Term; + + HashMap URI_id = new HashMap(); + HashMap id_URI = new HashMap(); + //create a URI-ID Map + + for (int i = 0; i< terms.length; i++){ + if(count[ i ] != 0 && terms[i].contains(":")){ + //System.out.println("terms["+i+"]="+terms[i]); + String annType = terms[i].split(":")[0]; + //System.out.println(annType); + SemanticAnnotationHelper annHelper = engine.getAnnotationHelper(annType); + //System.out.println(term); + if(annHelper!=null){ + String annotation = annHelper.describeMention(terms[i]); + logger.info(annotation); + String URI = extractURIFromAnnotation(annotation); + logger.info(URI); + if(URI == null){ + //ToDO + } + if(URI != null){ + URI_id.put(URI, i); + id_URI.put(i, URI); + } + + } + } + } + + //System.out.println("URI_id:"+ URI_id.keySet()); + + //i = number of total documents + + for( int i = size.length; i-- != 0; ) size[ i ] = sizes[ i ].getInt( document ); + + int k; + double score = 0; + //System.out.println("count.length: "+ count.length); + + + + for ( int 
i = count.length; i-- != 0; ) {
+
+			k = indexNumber[ i ];
+
+			//if we have URIs - compute HCF-IDF
+
+			if(count[ i ] != 0 && terms[i].contains(":")){
+
+				Set<URI> URIs = new HashSet<URI>();
+				String uri = id_URI.get(i);
+				//System.out.println(uri);
+
+				if(uri!=null){
+					URIs.add(new URIImpl(uri));
+					double bellLog = BellLog(URIs, count, size, indexNumber, URI_id);
+					//System.out.println("bellLog:" + bellLog);
+					score += bellLog * weightedIdfPart[i];
+				}
+				else{
+					//System.out.println("no URI found for id:"+i);
+				}
+			}
+			//no fallback: without a URI the term does not contribute to the ranking
+		}
+
+		logger.info("score: "+ score);
+		logger.info("---------------------");
+		return score;
+	}
+
+	private double BellLog(Set<URI> uris, int[] count, int[] size, int[] indexNumber, Map<String, Integer> URI_id){
+		double bellLog = 0.01;
+
+		//long nodeLevel = getNodeLevel(uri);
+		if(uris!=null && uris.size()>0){
+			for(URI uri : uris){
+				Set<URI> children = getNextChildren(uri);
+				Integer o = URI_id.get(uri.toString());
+				String graphName = getGraphName(uri.toString());
+
+				if(children==null || children.size()<1){
+					bellLog += 0.01;
+				}
+
+				if(o!=null){
+					int id = o.intValue();
+					int k = indexNumber[id];
+
+					double childrenBellLog = BellLog(children, count, size, indexNumber, URI_id);
+					//System.out.println("childrenBellLog: "+childrenBellLog);
+
+					//cast count[id] (not the quotient) to double: an all-integer division
+					//would truncate the term-frequency contribution to 0
+					bellLog += ((double)count[id] / size[k]) + FL(uri, graphName) * childrenBellLog;
+					//System.out.println("temp bellLog: "+bellLog);
+				}
+			}
+		}
+		//System.out.println("BellLog: "+bellLog);
+		return bellLog;
+	}
+
+	private Set<URI> getNextChildren(URI uri) {
+		Set<URI> children = new HashSet<URI>();
+
+		String vocab = getGraphName(uri.toString());
+		long nodeLevel = getNodeLevel(uri);
+
+		SM_Engine engine = (SM_Engine)graphMap.get(vocab);
+
+		if(engine==null)
+			return children;
+
+		Set<URI> allChildren = engine.getChildren(uri);
+
+		Set<URI> nodesOnNextLevel = nodesAtLevel(nodeLevel+1, vocab);
+
+		if(nodesOnNextLevel==null)
+			return children;
+
+		//intersection of allChildren and nodesOnNextLevel = children
+		for (URI u : nodesOnNextLevel)
+		{
+			if (allChildren.contains(u))
+				children.add(u);
+		}
+
+		return children;
+	}
+
+	private double FL(URI uri, String graphName){
+		//System.out.println("FL for URI "+uri);
+		long nodeLevel = 0;
+		double FL = 0.01;
+
+		nodeLevel = getNodeLevel(uri);
+		//System.out.println("NodeLevel: "+nodeLevel);
+		Set<URI> nodesAtLevel = nodesAtLevel(nodeLevel+1, graphName);
+		//System.out.println("NodesAtLevel.size : "+nodesAtLevel.size());
+		if(nodesAtLevel == null || nodesAtLevel.size() == 0 ){
+			//System.out.println("FL: 0.01");
+			return 0.01;
+		}
+		else{
+			//add 0.5 for the corner case of only one leaf: nodesAtLevel.size()=1, log10(1)=0, 1/0 = N/A
+			FL = (double)1 / Math.log10(nodesAtLevel.size() + 0.5);
+			//System.out.println("FL: " + FL);
+			return FL;
+		}
+	}
+
+	private Set<URI> nodesAtLevel(long nodeLevel, String graphName){
+		Set<URI> URISet = null;
+
+		//SM_Engine engine = (SM_Engine)graphMap.get(vocab.toLowerCase());
+		HashMap<Long, Set<URI>> nodeLevelMap = (HashMap<Long, Set<URI>>) graphNodeLevelMap.get(graphName);
+
+		if(nodeLevelMap!=null && nodeLevelMap.get(nodeLevel)!=null){
+			URISet = nodeLevelMap.get(nodeLevel);
+		}
+
+		return URISet;
+	}
+
+	private Long getNodeLevel(URI uri){
+
+		String vocab = getGraphName(uri.toString());
+
+		//HashMap nodeLevelMap = (HashMap) graphNodeLevelMap.get(vocab);
+		HashMap<URI, Long> nodeLevelMap = (HashMap<URI, Long>) graphURINodeLevelMap.get(vocab);
+
+		if(nodeLevelMap!=null)
+			return nodeLevelMap.get(uri);
+
+		return Long.valueOf(0L);
+	}
+
+	private static String getGraphName(String uri) {
+		String[] graphURIName = uri.split("/");
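+		/* Example (illustrative) for the OBO-style URIs used throughout this patch:
+		 * "http://purl.obolibrary.org/obo/ENVO_00001998".split("/") ends with
+		 * "ENVO_00001998"; the split("_") below then yields vocabNameArray[0] = "ENVO",
+		 * so the method returns "envo" as the graph/vocabulary name. */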
String[] vocabNameArray = graphURIName[graphURIName.length-1].split("_"); + String vocab=vocabNameArray[0]; + + return vocab.toLowerCase(); + +} + +/** + * extracts the broader feature value from an annotation String in the form {Material broader = http://purl.obolibrary.org/obo/ENVO_00001998} + * @param annotation + * @return broader feature, e.g., http://purl.obolibrary.org/obo/ENVO_00000109 + */ +private String extractBroaderFromAnnotation(String annotation) { + //System.out.println("Annotation: " + annotation); + String pattern = "\\bbroader = ((https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]_[0-9]+)"; + + if(annotation!=null && annotation.length()>0){ + Pattern r = Pattern.compile(pattern); + Matcher m = r.matcher(annotation); + if (m.find()) { + return m.group(1); + } + } + + return null; +} + + +/** + * extracts a URI from an AnnotationString in the form {Material inst = http://purl.obolibrary.org/obo/ENVO_00001998} + * @param annotation + * @return URI, e.g., http://purl.obolibrary.org/obo/ENVO_00000109 + */ + private String extractURIFromAnnotation(String annotation) { + //System.out.println("Annotation: " + annotation); + String pattern = "\\b(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]_[0-9]+"; + + if(annotation!=null && annotation.length()>0){ + Pattern r = Pattern.compile(pattern); + Matcher m = r.matcher(annotation); + if (m.find()) { + return m.group(0); + } + } + + return null; +} + + /** + * extracts a URI from a QueryString in the form {Material inst = http://purl.obolibrary.org/obo/ENVO_00001998} + * @param annotation + * @return List, e.g., http://purl.obolibrary.org/obo/ENVO_00000109 + */ + private ArrayList extractURIFromQuery(String query) { + //System.out.println("Annotation: " + annotation); + ArrayList uris = new ArrayList(); + + String patternUri = "\\b(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]_[0-9]+"; + String wc =""; + + if(query!=null && query.length()>0){ + + + Pattern r = Pattern.compile(patternUri); + Matcher m = r.matcher(query); + while (m.find()) { + uris.add(m.group(0)); + } + } + + return uris; +} + + + + + +@Override + public boolean usesIntervals() { + return false; + } + + + @Override + public void wrap(DocumentIterator documentIterator) throws IOException { + super.wrap(documentIterator); + this.underlyingExecutor = (QueryExecutor)documentIterator; + + QueryNode query = this.underlyingExecutor.getQueryNode(); + + String querySegmentString = query.toString(); + logger.info("Query: " +querySegmentString); + URIsInQuery = extractURIFromQuery(querySegmentString); + logger.info("URIs in query:" +URIsInQuery); + + //QueryEngine engine = this.underlyingExecutor.getQueryEngine(); + + + /* Note that we use the index array provided by the weight function, *not* by the visitor or by the iterator. + * If the function has an empty domain, this call is equivalent to prepare(). */ + termVisitor.prepare( index2Weight.keySet() ); + //System.out.println("index2Weight:"+index2Weight.keySet()); + + //Visitor pattern, accept method calls the visit Method in TermCollectionVisitor + //here, the actual "filling" takes place + documentIterator.accept( termVisitor ); + + //if ( DEBUG ) logger.debug( "Term Visitor found " + termVisitor.numberOfPairs() + " leaves" ); + + // Note that we use the index array provided by the visitor, *not* by the iterator. 
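+	// (Illustrative summary: the code below precomputes, for every query leaf i,
+	//  weightedIdfPart[i] = ln( numberOfDocuments / frequency[i] ) * index weight,
+	//  i.e. the classic IDF component; score() then multiplies it by the hierarchical
+	//  BellLog concept weight instead of a plain term frequency.)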
+ final Index[] index = termVisitor.indices(); + + + + if ( DEBUG ) logger.debug( "Indices: " + Arrays.toString( index ) ); + + // Some caching of frequently-used values + sizes = new IntBigList[ index.length ]; + //System.out.println("Index.length:"+index.length); + for( int i = index.length; i-- != 0; ) + if ( ( sizes[ i ] = index[ i ].sizes ) == null ) throw new IllegalStateException( "A BM25 scorer requires document sizes" ); + + setupVisitor.prepare(); + + + documentIterator.accept( setupVisitor ); + + final long[] frequency = setupVisitor.frequency; + final int[] indexNumber = setupVisitor.indexNumber; + + + + // We do all logs here, and multiply by the weight + weightedIdfPart = new double[ frequency.length ]; + for( int i = weightedIdfPart.length; i-- != 0; ) { + //System.out.println("frequency:"+frequency[i]); + weightedIdfPart[ i ] = Math.log( index[ indexNumber[ i ] ].numberOfDocuments / (double)frequency[ i ] ) * index2Weight.getDouble( index[ indexNumber[ i ] ] ); + //System.out.println("weightedIdfPart[i]: " + weightedIdfPart[ i ]); + } + size = new int[ index.length ]; + + + } + +protected QueryExecutor underlyingExecutor; + + + + public long nextDocument(long greaterThan) throws IOException { + return underlyingExecutor.nextDocument(greaterThan); + } + + public Binding nextHit() throws IOException { + return underlyingExecutor.nextHit(); + } + + +} diff --git a/Mimir6.2/plugins/HCFIDFScorer/src/main/resources/creole.xml b/Mimir6.2/plugins/HCFIDFScorer/src/main/resources/creole.xml new file mode 100644 index 0000000..2a2d21b --- /dev/null +++ b/Mimir6.2/plugins/HCFIDFScorer/src/main/resources/creole.xml @@ -0,0 +1 @@ + diff --git a/Mimir6.2/plugins/HCFIDFScorer/src/main/resources/logback.xml b/Mimir6.2/plugins/HCFIDFScorer/src/main/resources/logback.xml new file mode 100644 index 0000000..f3713d1 --- /dev/null +++ b/Mimir6.2/plugins/HCFIDFScorer/src/main/resources/logback.xml @@ -0,0 +1,39 @@ + + + + + logs/output.log + + + logs/output-%d{yyyy-MM-dd}.%i.log + + + 100MB + + + + + %d - %-5level %logger{36} - %msg %n + + false + + + + + + %d - %-5level %logger{36} - %msg %n + + + + + + + + + + + + + + + diff --git a/Mimir6.2/plugins/TaxonomicScorer/pom.xml b/Mimir6.2/plugins/TaxonomicScorer/pom.xml new file mode 100644 index 0000000..0301a15 --- /dev/null +++ b/Mimir6.2/plugins/TaxonomicScorer/pom.xml @@ -0,0 +1,72 @@ + + + + 4.0.0 + + + uk.ac.gate + gate-plugin-base + + 8.6 + + + + + + + uk.ac.gate.mimir + mimir-plugin-TaxonomicScorer + 6.2-SNAPSHOT + + TaxonomicScorer + blbla + + + + + GNU Lesser General Public License (LGPL), Version 3 + http://www.gnu.org/licenses/lgpl-3.0.txt + repo + + + + + UTF-8 + 1.8 + 1.8 + + + + + + + uk.ac.gate.mimir + mimir-core + 6.2-SNAPSHOT + provided + + + + + + + + junit + junit + 4.12 + + + + + + de.unijena.cs.fusion + slibAPI + 0.0.1 + + + + + diff --git a/Mimir6.2/plugins/TaxonomicScorer/src/main/java/de/unijena/cs/fusion/score/TaxonomicScorer.java b/Mimir6.2/plugins/TaxonomicScorer/src/main/java/de/unijena/cs/fusion/score/TaxonomicScorer.java new file mode 100644 index 0000000..6dffc20 --- /dev/null +++ b/Mimir6.2/plugins/TaxonomicScorer/src/main/java/de/unijena/cs/fusion/score/TaxonomicScorer.java @@ -0,0 +1,478 @@ + +package de.unijena.cs.fusion.score; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import 
de.unijena.cs.fusion.slib.SML; +import gate.mimir.ConstraintType; +import gate.mimir.SemanticAnnotationHelper; +import gate.mimir.search.QueryEngine; +import gate.mimir.search.query.Binding; +import gate.mimir.search.query.QueryExecutor; +import gate.mimir.search.query.QueryNode; +import gate.mimir.search.score.MimirScorer; +import gate.mimir.util.DelegatingSemanticAnnotationHelper; +import it.unimi.di.big.mg4j.index.Index; +import it.unimi.di.big.mg4j.index.IndexIterator; +import it.unimi.di.big.mg4j.search.DocumentIterator; +import it.unimi.di.big.mg4j.search.score.AbstractWeightedScorer; +import it.unimi.di.big.mg4j.search.visitor.CounterCollectionVisitor; +import it.unimi.di.big.mg4j.search.visitor.CounterSetupVisitor; +import it.unimi.di.big.mg4j.search.visitor.TermCollectionVisitor; +import it.unimi.dsi.fastutil.ints.IntBigList; +import slib.sml.sm.core.engine.SM_Engine; +import slib.utils.ex.SLIB_Ex_Critic; + + +public class TaxonomicScorer extends AbstractWeightedScorer implements MimirScorer{ + private static final long serialVersionUID = 3855212427922484546L; + + + private static final boolean DEBUG = true; + + /** The counter collection visitor used to estimate counts. */ + private final CounterCollectionVisitor counterCollectionVisitor; + /** The counter setup visitor used to estimate counts. */ + private final CounterSetupVisitor setupVisitor; + /** The term collection visitor used to estimate counts. */ + private final TermCollectionVisitor termVisitor; + + /** An array (parallel to {@link #currIndex}) that caches size lists. */ + private IntBigList sizes[]; + /** An array (parallel to {@link #currIndex}) used by {@link #score()} to cache the current document sizes. */ + private int[] size; + /** An array indexed by offsets that caches the inverse document-frequency part of the formula, multiplied by the index weight. 
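+   * (Illustratively: weightedIdfPart[i] = ln( numberOfDocuments / frequency[i] ) * index weight,
+   * as computed in wrap() below.)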
*/
+	private double[] weightedIdfPart;
+
+	private ArrayList<String> URIsInQuery;
+
+	private ArrayList<String> annTypesInQuery;
+
+	private HashMap graphMap;
+
+	private HashMap graphNodeLevelMap;
+	//private HashMap countChildrenMap;
+
+	private static double ALPHA = 0.5;
+
+	double alphaE = 1.0;
+	double alphaC = 1.0;
+
+	boolean alpha_beta = true;
+
+	private static final Logger logger = LoggerFactory.getLogger(TaxonomicScorer.class);
+
+	public SML sml;
+
+	public TaxonomicScorer(SML sml) {
+
+		termVisitor = new TermCollectionVisitor();
+		setupVisitor = new CounterSetupVisitor( termVisitor );
+		counterCollectionVisitor = new CounterCollectionVisitor( setupVisitor );
+		this.sml = sml;
+
+	}
+
+	public synchronized TaxonomicScorer copy() {
+		final TaxonomicScorer scorer = new TaxonomicScorer(new SML());
+		scorer.setWeights( index2Weight );
+		return scorer;
+	}
+
+	@Override
+	public double score(Index index) throws IOException {
+		return score();
+	}
+
+	/**
+	 * computes the score per document for a given query
+	 */
+	@Override
+	public double score() throws IOException {
+
+		setupVisitor.clear();
+		documentIterator.acceptOnTruePaths( counterCollectionVisitor );
+		this.underlyingExecutor = (QueryExecutor)documentIterator;
+		QueryEngine engine = this.underlyingExecutor.getQueryEngine();
+		//MimirIndex mimirIndex = engine.getIndex();
+
+		if ( documentIterator instanceof IndexIterator ) indexIterator = new IndexIterator[] { (IndexIterator)documentIterator };
+
+		final long document = documentIterator.document();
+		//logger.info("document: "+document);
+
+		final int[] count = setupVisitor.count; //TF - array with the frequency of each term in the current document
+		final int[] indexNumber = setupVisitor.indexNumber;
+		final double[] weightedIdfPart = this.weightedIdfPart; //idf weights
+		final int[] size = this.size; //array of document sizes (document length, important for normalization)
+
+		String[] terms = setupVisitor.termId2Term;
+
+		HashMap<String, Integer> URI_id = new HashMap<String, Integer>();
+		HashMap<Integer, String> id_URI = new HashMap<Integer, String>();
+
+		HashMap<String, Integer> URI_broader_id = new HashMap<String, Integer>();
+		HashMap<Integer, String> id_broader_URI = new HashMap<Integer, String>();
+
+		HashMap<String, Integer> URI_category = new HashMap<String, Integer>();
+		HashMap<Integer, String> category_URI = new HashMap<Integer, String>();
+		//create a URI-ID map
+
+		double cat_boost = 0.0;
+
+		for (int i = 0; i < terms.length; i++){
+			if(count[ i ] != 0 && terms[i].contains(":")){
+				logger.info("terms["+i+"]="+terms[i]);
+				String annType = terms[i].split(":")[0];
+				//System.out.println(annType);
+				SemanticAnnotationHelper annHelper = engine.getAnnotationHelper(annType);
+				//String term = mimirIndex.getAnnotationIndex(annType).getDirectTerm(i).toString();
+				//System.out.println(term);
+				if(annHelper!=null){
+					String annotation = annHelper.describeMention(terms[i]);
+					//System.out.println("Annotation: "+annotation);
+					if("Category".equals(annType)){ //equals(), not ==, for string comparison
+						String inst = extractInstFromAnnotation(annotation);
+						if(inst!=null){
+							URI_category.put(inst, i);
+							category_URI.put(i, inst);
+						}
+					}
+					else{
+						//System.out.println(annotation);
+						String URI = extractInstFromAnnotation(annotation);
+						String broader = extractBroaderFromAnnotation(annotation);
+						logger.info(URI);
+						logger.info(broader);
+						if(URI == null){
+							//ToDo
+						}
+						if(URI != null){
+							URI_id.put(URI, i);
+							id_URI.put(i, URI);
+						}
+						if(broader != null){
+							id_broader_URI.put(i, broader);
+							URI_broader_id.put(broader, i);
+						}
+					}
+				}
+			}
+		}
+
+		//logger.info("inst_id:"+ URI_id.keySet());
+		//logger.info("broader_id:"+ id_broader_URI.keySet());
+		//cache the current document's size in each sub-index
+
+		for( int i = size.length; i-- != 0; ) size[ i ] =
sizes[ i ].getInt( document ); + + int k; + double score = 0; + //System.out.println("count.length: "+ count.length); + + double alpha = 1.0; + double beta = 1.0; + + for ( int i = count.length; i-- != 0; ) { + + k = indexNumber[ i ]; + + + if(count[ i ] != 0 && terms[i].contains(":")){ + + + String uri = id_URI.get(i); + String relatedURI = id_broader_URI.get(i); + + logger.info("count["+i+"]:"+count[ i ]); + logger.info("size["+k+"]:"+size[ k ]); + logger.info("weightedIdfPart["+i+"]:"+weightedIdfPart[i ]); + + if(alpha_beta) { + alphaE = computeAlphaE(ALPHA); + alphaC = computeAlphaC(ALPHA); + } + + //exact match + if(uri!=null && URIsInQuery!=null && URIsInQuery.contains(uri)){ + score += ((double)count[ i ] / size[ k ] * weightedIdfPart[ i ]) * alphaE; + + } + //no exact match but a related URI - compute semantic similarity and use that + else if(relatedURI!=null){ + logger.info("relatedURI: "+relatedURI); + double semSim = semanticSimilarity(URIsInQuery, relatedURI); + score += ((double)count[ i ] / size[ k ] * weightedIdfPart[ i ] * semSim) * alphaC; + } + + + + } + //no fallback, no URI - no ranking + else{ + System.out.println("No URI found for " + terms[i]); + + } + } + + + logger.info("score: "+ score); + logger.info("---------------------"); + return score; + + + + } + + + private double computeAlphaC(double alpha) { + double alphaC = 0.0; + + if(alpha > 0.0) { + alphaC = (1 - alpha) / (Math.sqrt((alpha*alpha) + ((1 - alpha) * (1 - alpha)))); + } + return alphaC; + } + + +private double computeAlphaE(double alpha) { + + double alphaE = 0.0; + + if(alpha > 0.0) { + alphaE = alpha / (Math.sqrt((alpha*alpha) + ((1 - alpha) * (1 - alpha)))); + } + return alphaE; +} + + +private double semanticSimilarity(ArrayList URIs, String uri) { + + double semSim=0; + + for(int i = 0; i < URIs.size(); i++){ + try { + + semSim += sml.computeSemSim(URIs.get(i), uri); + //logger.info("TaxScorer: semSim ("+ URIs.get(i)+ "," + uri+"):" + String.valueOf(semSim)); + + } catch (SLIB_Ex_Critic e) { + // TODO Auto-generated catch block + e.printStackTrace(); + + //if no semantic similarity can be computed, e.g., two URIs don't have the same root + //semSim=0 + semSim += 0; + } + } + logger.info("TaxScorer: semSim ("+ URIs.toString()+ "," + uri+"):" + String.valueOf(semSim)); + // System.out.println("TaxScorer: semSim ("+ URIs.toString()+ "," + uri+"):" + String.valueOf(semSim)); + return semSim; +} + + + + + +/** + * extracts the inst feature value from an annotation String in the form {Material inst = http://purl.obolibrary.org/obo/ENVO_00001998} + * @param annotation + * @return inst feature, e.g., http://purl.obolibrary.org/obo/ENVO_00000109 + */ + private String extractInstFromAnnotation(String annotation) { + //System.out.println("Annotation: " + annotation); + String patternInst = "\\binst = ((https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]_[0-9]+)"; + + if(annotation!=null && annotation.length()>0){ + Pattern r = Pattern.compile(patternInst); + Matcher m = r.matcher(annotation); + if (m.find()) { + + return m.group(1); + + } + } + + return null; +} + + /** + * extracts the broader feature value from an annotation String in the form {Material broader = http://purl.obolibrary.org/obo/ENVO_00001998} + * @param annotation + * @return broader feature, e.g., http://purl.obolibrary.org/obo/ENVO_00000109 + */ + private String extractBroaderFromAnnotation(String annotation) { + //System.out.println("Annotation: " + annotation); + String pattern = "\\bbroader = 
((https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]_[0-9]+)"; + + if(annotation!=null && annotation.length()>0){ + Pattern r = Pattern.compile(pattern); + Matcher m = r.matcher(annotation); + if (m.find()) { + return m.group(1); + } + } + + return null; +} + + + + /** + * extracts a URI from a QueryString in the form {Material inst = http://purl.obolibrary.org/obo/ENVO_00001998} + * @param annotation + * @return List, e.g., http://purl.obolibrary.org/obo/ENVO_00000109 + */ + private ArrayList extractURIFromQuery(String query) { + //System.out.println("Annotation: " + annotation); + ArrayList uris = new ArrayList(); + + String patternUri = "inst EQ ((https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]_[0-9]+)"; + + if(query!=null && query.length()>0){ + + Pattern r = Pattern.compile(patternUri); + Matcher m = r.matcher(query); + while (m.find()) { + //System.out.println(m.group(0)); + //System.out.println(m.group(1)); + + uris.add(m.group(1)); + } + } + + return uris; +} + + + +@Override + public boolean usesIntervals() { + return false; + } + + + @Override + public void wrap(DocumentIterator documentIterator) throws IOException { + super.wrap(documentIterator); + this.underlyingExecutor = (QueryExecutor)documentIterator; + + + QueryNode query = this.underlyingExecutor.getQueryNode(); + + + + String querySegmentString = query.toString(); + logger.info("Query: " +querySegmentString); + + URIsInQuery = extractURIFromQuery(querySegmentString); + annTypesInQuery = extractAnnTypesFromQuery(querySegmentString); + + logger.info("URIs in query:" +URIsInQuery); + + QueryEngine engine = this.underlyingExecutor.getQueryEngine(); + + + /* Note that we use the index array provided by the weight function, *not* by the visitor or by the iterator. + * If the function has an empty domain, this call is equivalent to prepare(). */ + termVisitor.prepare( index2Weight.keySet() ); + //System.out.println("index2Weight:"+index2Weight.keySet()); + + //Visitor pattern, accept method calls the visit Method in TermCollectionVisitor + //here, the actual "filling" takes place + documentIterator.accept( termVisitor ); + + //if ( DEBUG ) logger.debug( "Term Visitor found " + termVisitor.numberOfPairs() + " leaves" ); + + // Note that we use the index array provided by the visitor, *not* by the iterator. 
+ final Index[] index = termVisitor.indices(); + + + + if ( DEBUG ) logger.debug( "Indices: " + Arrays.toString( index ) ); + + // Some caching of frequently-used values + sizes = new IntBigList[ index.length ]; + //System.out.println("Index.length:"+index.length); + for( int i = index.length; i-- != 0; ) + if ( ( sizes[ i ] = index[ i ].sizes ) == null ) throw new IllegalStateException( "A BM25 scorer requires document sizes" ); + + setupVisitor.prepare(); + + + documentIterator.accept( setupVisitor ); + + final long[] frequency = setupVisitor.frequency; + final int[] indexNumber = setupVisitor.indexNumber; + + + + // We do all logs here, and multiply by the weight + weightedIdfPart = new double[ frequency.length ]; + for( int i = weightedIdfPart.length; i-- != 0; ) { + //System.out.println("frequency:"+frequency[i]); + logger.info("frequency["+i+"]:"+frequency[i]); + logger.info("indexNumber["+i+"]:"+indexNumber[i]); + logger.info("index2Weight.index[indexNumber["+i+"]]:"+index2Weight.getDouble( index[ indexNumber[ i ] ] )); + logger.info("index[indexNumber["+i+"]].numberOfDocuments:"+index[ indexNumber[ i ] ].numberOfDocuments ); + + weightedIdfPart[ i ] = Math.log( index[ indexNumber[ i ] ].numberOfDocuments / (double)frequency[ i ] ) * index2Weight.getDouble( index[ indexNumber[ i ] ] ); + //System.out.println("weightedIdfPart[i]: " + weightedIdfPart[ i ]); + logger.info("weightedIdfPart["+i+"]: " + weightedIdfPart[ i ]); + } + size = new int[ index.length ]; + + + } + +private ArrayList extractAnnTypesFromQuery(String query) { + //System.out.println("Annotation: " + annotation); + ArrayList annTypes = new ArrayList(); + + String patternType = "\\btype = ([A-Za-z]+)"; + if(query!=null && query.length()>0){ + + + Pattern r = Pattern.compile(patternType); + Matcher m = r.matcher(query); + while (m.find()) { + annTypes.add(m.group(1)); + } + } + + return annTypes; +} + +protected QueryExecutor underlyingExecutor; + + + + public long nextDocument(long greaterThan) throws IOException { + return underlyingExecutor.nextDocument(greaterThan); + } + + public Binding nextHit() throws IOException { + return underlyingExecutor.nextHit(); + } + + +} diff --git a/Mimir6.2/plugins/TaxonomicScorer/src/main/resources/creole.xml b/Mimir6.2/plugins/TaxonomicScorer/src/main/resources/creole.xml new file mode 100644 index 0000000..2a2d21b --- /dev/null +++ b/Mimir6.2/plugins/TaxonomicScorer/src/main/resources/creole.xml @@ -0,0 +1 @@ + diff --git a/Mimir6.2/plugins/TaxonomicScorer/src/main/resources/log4j.properties b/Mimir6.2/plugins/TaxonomicScorer/src/main/resources/log4j.properties new file mode 100644 index 0000000..44e8045 --- /dev/null +++ b/Mimir6.2/plugins/TaxonomicScorer/src/main/resources/log4j.properties @@ -0,0 +1,13 @@ +# Root logger option +log4j.rootLogger=DEBUG, console, file + +# console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=[%t] %-5p %c %x - %m%n + +# file +log4j.appender.file=org.apache.log4j.RollingFileAppender +log4j.appender.file.layout=org.apache.log4j.PatternLayout +log4j.appender.file.layout.ConversionPattern=[%t] %-5p %c %x - %m%n +log4j.appender.file.File=C:/Benutzer/Felicitas_adm/mimir/log.txt \ No newline at end of file diff --git a/Mimir6.2/plugins/TaxonomicScorer/src/main/resources/logback.xml b/Mimir6.2/plugins/TaxonomicScorer/src/main/resources/logback.xml new file mode 100644 index 0000000..0d2b383 --- /dev/null +++ 
b/Mimir6.2/plugins/TaxonomicScorer/src/main/resources/logback.xml @@ -0,0 +1,39 @@ + + + + + logs/output.log + + + logs/output-%d{yyyy-MM-dd}.%i.log + + + 100MB + + + + + %d - %-5level %logger{36} - %msg %n + + false + + + + + + %d - %-5level %logger{36} - %msg %n + + + + + + + + + + + + + + + diff --git a/Mimir6.2/webapp/mimir-cloud/grails-app/services/gate/mimir/cloud/MimirScorerService.groovy b/Mimir6.2/webapp/mimir-cloud/grails-app/services/gate/mimir/cloud/MimirScorerService.groovy new file mode 100644 index 0000000..f46b341 --- /dev/null +++ b/Mimir6.2/webapp/mimir-cloud/grails-app/services/gate/mimir/cloud/MimirScorerService.groovy @@ -0,0 +1,48 @@ +package gate.mimir.cloud + +import it.unimi.di.big.mg4j.search.score.BM25Scorer +import it.unimi.di.big.mg4j.search.score.CountScorer +import it.unimi.di.big.mg4j.search.score.TfIdfScorer +import gate.mimir.search.score.BindingScorer +import gate.mimir.search.score.DelegatingScoringQueryExecutor as DSQE +import gate.mimir.search.score.MimirScorer +import gate.mimir.web.ScorerSource +import java.util.concurrent.Callable + +import de.unijena.cs.fusion.score.HCFIDFScorer +import de.unijena.cs.fusion.score.TaxonomicScorer +import de.unijena.cs.fusion.score.CFIDFExactScorer +import de.unijena.cs.fusion.slib.SML + + +/** + * Service that takes a scorer name and returns the corresponding scorer + * source (a Callable that returns an appropriate new scorer when called). + */ +class MimirScorerService implements ScorerSource { + + SML sml + + @Override + public Callable scorerForName(String name) { + return scorers[name] + } + + @Override + public Collection scorerNames() { + return scorers.keySet() + } + + public init(){ + sml = new SML() + } + def scorers = [ + 'Count Scoring': { -> new DSQE(new CountScorer()) }, + 'TF.IDF': { -> new DSQE(new TfIdfScorer()) }, + 'BM25': { -> new DSQE(new BM25Scorer()) }, + 'Hit Length Scoring': { -> new BindingScorer() }, + 'HCF-IDF Scorer': { -> new HCFIDFScorer(sml) }, + 'Taxonomic Scorer': { -> new TaxonomicScorer(sml) }, + 'CFIDFExact Scorer': { -> new CFIDFExactScorer() } + ] +} diff --git a/MimirSearchAPI/pom.xml b/MimirSearchAPI/pom.xml new file mode 100644 index 0000000..08737f8 --- /dev/null +++ b/MimirSearchAPI/pom.xml @@ -0,0 +1,50 @@ + + 4.0.0 + + de.unijena.cs.fusion + mimirSearchAPI + 0.0.1 + jar + + mimirTest + http://maven.apache.org + + + + junit + junit + 4.12 + + + + + ch.qos.logback + logback-classic + 1.2.2 + + + + org.slf4j + slf4j-api + 1.7.30 + + + + org.apache.httpcomponents + httpclient + 4.5.13 + + + + org.apache.httpcomponents + httpasyncclient + 4.1.4 + + + + + + 1.8 + 1.8 + + \ No newline at end of file diff --git a/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Document.java b/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Document.java new file mode 100644 index 0000000..133c210 --- /dev/null +++ b/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Document.java @@ -0,0 +1,130 @@ +package de.unijena.cs.fusion.mimir; + +import java.util.ArrayList; + +public class Document { + + //ID + String documentID; + + + //title + String title; + + // document rank (in search result) + int rank; + + //score + double score; + + // document text snippet for display (contains the hit and surrounding tokens) + String documentText; + + String[] documentTextArray; + + // array for collecting the hits (text) for highlighting + String[] highlighting; + + // hits array with hits (documentId, termPosition, tokenLength) + ArrayList hits; + + public void Document () { + } + + 
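+	/* Usage sketch (illustrative; mirrors how MimirSearch.search() below fills these fields):
+	 *   Document doc = new Document();
+	 *   doc.setRank(0);
+	 *   doc.setTitle("Some dataset title");
+	 *   doc.setHits(hits);                            // ArrayList of Hit objects
+	 *   ArrayList positions = doc.getTermPosOfHits(); // term positions, e.g. for highlighting
+	 */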
public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public int getRank() { + return rank; + } + + public void setRank(int rank) { + this.rank = rank; + } + + public String getDocumentID() { + return documentID; + } + + public void setDocumentID(String documentID) { + this.documentID = documentID; + } + + public Double getScore() { + return score; + } + + public void setScore(Double score) { + this.score = score; + } + + public String getDocumentText() { + return documentText; + } + + public void setDocumentText(String documentText) { + this.documentText = documentText; + } + + public String[] getDocumentTextArray() { + return documentTextArray; + } + + public void setDocumentTextArray(String[] documentTextArray) { + this.documentTextArray = documentTextArray; + } + + public String[] getHighlighting() { + return highlighting; + } + + public void setHighlighting(String[] highlighting) { + this.highlighting = highlighting; + } + + public ArrayList getHits() { + return hits; + } + + public void setHits(ArrayList hits) { + this.hits = hits; + } + + public ArrayList getTermPosOfHits(){ + ArrayList termPos = new ArrayList(); + + if (this.hits.size() > 0) { + this.hits.stream().forEach(hit -> { + // let i = 1; + termPos.add(Integer.valueOf(hit.getTermPosition())); + + }); + } + // console.log(termPos); + return termPos; + + } + + public Hit getHitByTermPos(String termPos){ + Hit hit = new Hit(); + + if (this.hits.size() > 0) { + this.hits.stream().forEach(h -> { + if (h.getTermPosition() == termPos) { + hit.setDocumentId(h.getDocumentId()); + hit.setTermPosition(h.getTermPosition()); + hit.setLength(h.getLength()); + hit.setHitText(h.getHitText()); + hit.setSnippet(h.getSnippet()); + + } + }); + } + return hit; + } +} diff --git a/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Hit.java b/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Hit.java new file mode 100644 index 0000000..094ce7d --- /dev/null +++ b/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Hit.java @@ -0,0 +1,50 @@ +package de.unijena.cs.fusion.mimir; + +public class Hit { + String documentId; + String termPosition; + String length; + String hitText; + String snippet; + + public Hit (){ + + } + public Hit (String documentId, String termPosition, String length, String hitText){ + this.documentId = documentId; + this.termPosition = termPosition; + this.length = length; + this.hitText = hitText; + } + public String getDocumentId() { + return documentId; + } + public void setDocumentId(String documentId) { + this.documentId = documentId; + } + public String getTermPosition() { + return termPosition; + } + public void setTermPosition(String termPosition) { + this.termPosition = termPosition; + } + public String getLength() { + return length; + } + public void setLength(String length) { + this.length = length; + } + public String getHitText() { + return hitText; + } + public void setHitText(String hitText) { + this.hitText = hitText; + } + public String getSnippet() { + return snippet; + } + public void setSnippet(String snippet) { + this.snippet = snippet; + } +} + diff --git a/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Metadata.java b/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Metadata.java new file mode 100644 index 0000000..6cc3d60 --- /dev/null +++ b/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/Metadata.java @@ -0,0 +1,24 @@ +package de.unijena.cs.fusion.mimir; + +public class Metadata { + + String title; + + 
+	String documentId;
+
+	public String getTitle() {
+		return title;
+	}
+
+	public void setTitle(String title) {
+		this.title = title;
+	}
+
+	public String getDocumentId() {
+		return documentId;
+	}
+
+	public void setDocumentId(String documentId) {
+		this.documentId = documentId;
+	}
+}
diff --git a/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/MimirSearch.java b/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/MimirSearch.java
new file mode 100644
index 0000000..15d437a
--- /dev/null
+++ b/MimirSearchAPI/src/main/java/de/unijena/cs/fusion/mimir/MimirSearch.java
@@ -0,0 +1,737 @@
+package de.unijena.cs.fusion.mimir;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URI;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.http.HttpResponse;
+import org.apache.http.NameValuePair;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.entity.UrlEncodedFormEntity;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
+import org.apache.http.impl.nio.client.HttpAsyncClients;
+import org.apache.http.message.BasicNameValuePair;
+import org.apache.http.util.EntityUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class MimirSearch {
+
+	private static final Logger logger = LoggerFactory.getLogger(MimirSearch.class);
+
+	public String INDEX_URL;
+
+	private String DOCSCOUNT_URL;
+
+	private String QUERY_URL;
+
+	private String DOCUMENT_HITS_URL;
+
+	private String DOCUMENT_METADATA_URL;
+
+	private String DOCUMENT_SCORE_URL;
+
+	private String CLOSE_URL;
+
+	private URI RENDER_DOCUMENT_URL = null;
+
+	public String query;
+	public String queryId;
+	public long docsCount;	//number of retrieved documents
+	public int maxDocs = 1000;	//maximum number of documents to be considered for ranking (default: 1000); '0' means: consider all results
+
+	int timeoutSeconds = 10;
+
+	RequestConfig requestConfig;
+
+	public MimirSearch(String indexURL){
+
+		this.INDEX_URL = indexURL;
+
+		DOCSCOUNT_URL = INDEX_URL + "documentsCount";
+		QUERY_URL = INDEX_URL + "postQuery";
+		DOCUMENT_HITS_URL = INDEX_URL + "documentHits";
+		DOCUMENT_METADATA_URL = INDEX_URL + "documentMetadata";
+		DOCUMENT_SCORE_URL = INDEX_URL + "documentScore";
+		CLOSE_URL = INDEX_URL + "close";
+
+		int CONNECTION_TIMEOUT_MS = timeoutSeconds * 1000;	// timeout in millis
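+
+		// the same limit is applied to connection-pool checkout, TCP connect,
+		// and socket reads below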
+		requestConfig = RequestConfig.custom()
+				.setConnectionRequestTimeout(CONNECTION_TIMEOUT_MS)
+				.setConnectTimeout(CONNECTION_TIMEOUT_MS)
+				.setSocketTimeout(CONNECTION_TIMEOUT_MS)
+				.build();
+	}
+
+	public ArrayList<Document> search(String query){
+		ArrayList<Document> documents = new ArrayList<Document>();
+
+		this.query = query;
+
+		logger.info("***** POST " + QUERY_URL + ": queryString: '" + query + "' *****");
+
+		try {
+			//1.) post the query and get the session ID
+			String postQueryResult = postQueryAsync(QUERY_URL, this.query);
+			this.queryId = readQueryIdFromXML(postQueryResult);
+
+			//the query ID can take a couple of seconds (sometimes minutes) to come back -
+			//sleep and try again after 5 seconds
+			if (this.queryId == null || this.queryId.length() == 0){
+
+				try
+				{
+					Thread.sleep(5 * 1000);
+				}
+				catch(InterruptedException ex)
+				{
+					Thread.currentThread().interrupt();
+				}
+				//second try
+				postQueryResult = postQueryAsync(QUERY_URL, this.query);
+				this.queryId = readQueryIdFromXML(postQueryResult);
+			}
+
+			logger.info("queryId: " + this.queryId);
+
+			//2.) next, get the number of documents that match the query = docsCount
+			this.docsCount = documentsCountAsync(DOCSCOUNT_URL, this.queryId);
+
+			//while docsCount == -1 the query is still running - poll every 5 seconds
+			while (this.docsCount < 0){
+
+				try
+				{
+					Thread.sleep(5 * 1000);	//poll until the count is either 0 (no documents) or > 0
+				}
+				catch(InterruptedException ex)
+				{
+					Thread.currentThread().interrupt();
+				}
+				//try it again
+				this.docsCount = documentsCountAsync(DOCSCOUNT_URL, this.queryId);
+			}
+
+			logger.info("docsCount: " + this.docsCount);
+
+			//3.) now, iterate through all ranks and get the hits per document
+			if (this.docsCount > 0){
+				int maxCount = (int) this.docsCount;	//by default consider all retrieved documents for ranking
+
+				//but if we have a large corpus and docsCount is larger than the maximum
+				//number of documents to be scored, limit the scoring to maxDocs
+				if(this.maxDocs > 0 && this.docsCount > this.maxDocs) {
+					maxCount = this.maxDocs;
+				}
+
+				for (int i = 0; i < maxCount; i++){
+
+					Document doc = new Document();
+					doc.setRank(i);
+					String rankS = String.valueOf(i);
+
+					//get the hits per document
+					ArrayList<Hit> hitsPerRank = documentHits(this.queryId, rankS);
+					doc.setHits(hitsPerRank);
+
+					//collect the other document data: title, document ID, score
+					// ToDo: also collect text snippets and the text for highlighting (parallel async calls)
+					Metadata documentMetadata = documentMetadataAsync(this.queryId, rankS);
+					if(documentMetadata.getDocumentId() != null)
+						doc.setDocumentID(documentMetadata.getDocumentId());
+
+					if(documentMetadata.getTitle() != null)
+						doc.setTitle(documentMetadata.getTitle());
+
+					double score = documentScore(this.queryId, rankS);
+
+					if(score > 0.0)
+						doc.setScore(score);
+
+					documents.add(doc);
+				}
+			}
+
+			// 4.) close the session
+			close(this.queryId);
+
+		} catch (IOException | InterruptedException | ExecutionException | TimeoutException e) {
+			logger.error("Search for '" + query + "' failed", e);
+		}
+		return documents;
+	}
+
+	/**
+	 * helper method to read the query ID from the XML result
+	 * @param postQueryResult the raw XML returned by postQuery
+	 * @return the query ID, or an empty string if none was found
+	 */
+	private String readQueryIdFromXML(String postQueryResult) {
+		String queryId = "";
+
+		if (postQueryResult == null) {
+			return queryId;
+		} else {
+			String pattern = "[0-9A-Za-z-]{36}";	//the query ID is a 36-character UUID
+			Pattern r = Pattern.compile(pattern);
+			Matcher m = r.matcher(postQueryResult);
+			if (m.find()) {
+				queryId = m.group(0);
+				return queryId;
+			}else {
+				logger.error(postQueryResult);
+			}
+		}
+		return queryId;
+	}
+
+	/**
+	 * sends the query to the Mimir postQuery endpoint
+	 * (an HTTP GET with the query in the queryString parameter)
+	 * @param url the postQuery endpoint URL
+	 * @param query the Mimir query string
+	 * @return the raw XML response containing the query ID
+	 * @throws IOException
+	 */
+	public String postQuery(String url, String query) throws IOException{
+		String result = "";
+		String urlString = url + "?queryString=" + encode(query);
+
+		logger.info(urlString);
+
+		HttpGet request = new HttpGet(urlString);
+
+		try (CloseableHttpClient httpClient = HttpClients.createDefault();
+			CloseableHttpResponse response = httpClient.execute(request)){
+
+			result = EntityUtils.toString(response.getEntity());
+
+			logger.info(result);
+		}
+
+		return result;
+	}
+
+	private String postQueryAsync (String url, String query) throws IOException, InterruptedException, ExecutionException {
+		String result = "";
+
+		CloseableHttpAsyncClient httpclient = HttpAsyncClients.createDefault();
+		try {
+			httpclient.start();
+			String urlString = url + "?queryString=" + encode(query);
+			logger.info(urlString);
+			HttpGet request = new HttpGet(urlString);
+
+			Future<HttpResponse> future = httpclient.execute(request, null);
+			HttpResponse response = future.get();
+
+			result = EntityUtils.toString(response.getEntity());
+			logger.info("Response Status: " + response.getStatusLine());
+			logger.info("Response: " + result);
+		} finally {
+			httpclient.close();
+		}
+
+		return result;
+	}
+
+	// method to encode a string value using the UTF-8 encoding scheme
+	private static String encode(String value) {
+		try {
+			return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
+		} catch (UnsupportedEncodingException ex) {
+			throw new RuntimeException(ex);
+		}
+	}
+
+	public long documentsCountAsync(String url, String queryId) throws IOException, InterruptedException, ExecutionException, TimeoutException{
+		long docsCount = -1;
+
+		if (queryId != null && !queryId.isEmpty()){
+
+			CloseableHttpAsyncClient httpclient = HttpAsyncClients.createDefault();
+			try {
+				httpclient.start();
+				String urlString = url + "?queryId=" + queryId;
+				logger.info(urlString);
+				HttpGet request = new HttpGet(urlString);
+
+				Future<HttpResponse> future = httpclient.execute(request, null);
+
+				HttpResponse response = future.get(timeoutSeconds, TimeUnit.SECONDS);
+				String result = EntityUtils.toString(response.getEntity());
+				docsCount = readDocsCountFromXML(result);
+			} finally {
+				httpclient.close();
+			}
+		}
+
+		return docsCount;
+	}
+
+	public long documentsCount(String queryId) throws IOException{
+		long docsCount = -1;
+
+		if (queryId != null && !queryId.isEmpty()){
+
+			HttpPost post = new HttpPost(DOCSCOUNT_URL);
+			post.setConfig(requestConfig);
+
+			// add request parameters or form parameters
+			List<NameValuePair> urlParameters = new ArrayList<>();
+			urlParameters.add(new BasicNameValuePair("queryId", queryId));
+
+			post.setEntity(new UrlEncodedFormEntity(urlParameters));
+
+			try (CloseableHttpClient httpClient = HttpClients.createDefault();
+				CloseableHttpResponse response = httpClient.execute(post)){
+
+				String result = EntityUtils.toString(response.getEntity());
+
+				logger.info(result);
+
+				docsCount = readDocsCountFromXML(result);
+			}
+		}
+		return docsCount;
+	}
+
+	private long readDocsCountFromXML(String getDocsCountResult) {
+		long docsCount = -1;
+
+		if (getDocsCountResult == null) {
+			return docsCount;
+		} else {
+			//the count is wrapped in a <value> element
+			String pattern = "<value>([0-9]+)<\\/value>";
+			Pattern r = Pattern.compile(pattern);
+			Matcher m = r.matcher(getDocsCountResult);
+			if (m.find()) {
+				String count = m.group(1);
+				if(count != null) {
+					docsCount = Long.valueOf(count);
+				}
+				return docsCount;
+			}else {
+				logger.error("No docsCount found in " + getDocsCountResult);
+			}
+		}
+		return docsCount;
+	}
+
+	private ArrayList<Hit> documentHits(String queryId, String rank) throws IOException{
+		ArrayList<Hit> hits = new ArrayList<Hit>();
+
+		if (queryId != null && !queryId.isEmpty()){
+
+			HttpPost post = new HttpPost(DOCUMENT_HITS_URL);
+			post.setConfig(requestConfig);
+
+			// add request parameters or form parameters
+			List<NameValuePair> urlParameters = new ArrayList<>();
+			urlParameters.add(new BasicNameValuePair("queryId", queryId));
+			urlParameters.add(new BasicNameValuePair("rank", rank));
+
+			post.setEntity(new UrlEncodedFormEntity(urlParameters));
+
+			try (CloseableHttpClient httpClient = HttpClients.createDefault();
+				CloseableHttpResponse response = httpClient.execute(post)){
+
+				String result = EntityUtils.toString(response.getEntity());
+
+				hits = readHitsFromXML(result);
+			}
+		}
+		return hits;
+	}
+
+	private ArrayList<Hit> readHitsFromXML(String hitsFromXML) {
+
+		ArrayList<Hit> hits = new ArrayList<Hit>();
+
+		if (hitsFromXML == null) {
+			logger.error("readHitsFromXML: result is null");
+		} else {
+
+			String patternHits = "<hits[^>]*>.+<\\/hits>";
+			Pattern r = Pattern.compile(patternHits);
+			Matcher m = r.matcher(hitsFromXML);
+			if (m.find()) {
+
+				String temp = m.group(0);	//all hits in a list
+
+				//single-quoted attributes and this attribute order are assumed here
+				String patternHit = "(<hit documentId='([0-9]+)' termPosition='([0-9]+)' length='([0-9]+)'\\/>)";
+				/*
+				 * group(1) - the whole <hit .../> element
+				 * group(2) - the documentId value
+				 * group(3) - the termPosition value
+				 * group(4) - the length value
+				 */
+				Pattern rHit = Pattern.compile(patternHit);
+				Matcher mHit = rHit.matcher(temp);
+
+				while(mHit.find()){
+					Hit hit = new Hit();
+					hit.setDocumentId(mHit.group(2));
+					hit.setTermPosition(mHit.group(3));
+					hit.setLength(mHit.group(4));
+
+					hits.add(hit);
+				}
+
+			}else {
+				logger.error("No hits found in " + hitsFromXML);
+			}
+		}
+
+		return hits;
+	}
+
+	private Metadata documentMetadata(String queryId, String rank) throws IOException{
+
+		Metadata metadata = new Metadata();
+
+		if (queryId != null && !queryId.isEmpty()){
+
+			HttpPost post = new HttpPost(DOCUMENT_METADATA_URL);
+			post.setConfig(requestConfig);
+
+			// add request parameters or form parameters
+			List<NameValuePair> urlParameters = new ArrayList<>();
+			urlParameters.add(new BasicNameValuePair("queryId", queryId));
+			urlParameters.add(new BasicNameValuePair("rank", rank));
+
+			post.setEntity(new UrlEncodedFormEntity(urlParameters));
+
+			try (CloseableHttpClient httpClient = HttpClients.createDefault();
+				CloseableHttpResponse response = httpClient.execute(post)){
+
+				String result = EntityUtils.toString(response.getEntity());
+
+				metadata.setTitle(readTitleFromXML(result));
+				metadata.setDocumentId(readDocumentIdFromResult(result));
+			}
+		}
+		return metadata;
+	}
+
+	private Metadata documentMetadataAsync(String queryId, String rank) throws IOException{
+
+		Metadata metadata = new Metadata();
+
+		if (queryId != null && !queryId.isEmpty()){
+			HttpPost post = new HttpPost(DOCUMENT_METADATA_URL);
+			post.setConfig(requestConfig);
+
+			// add request parameters or form parameters
+			List<NameValuePair> urlParameters = new ArrayList<>();
+			urlParameters.add(new BasicNameValuePair("queryId", queryId));
+			urlParameters.add(new BasicNameValuePair("rank", rank));
+
+			post.setEntity(new UrlEncodedFormEntity(urlParameters));
+			CloseableHttpAsyncClient httpclient = HttpAsyncClients.createDefault();
+			try {
+				httpclient.start();
+
+				Future<HttpResponse> future = httpclient.execute(post, null);
+				HttpResponse response = future.get();
+				String result = EntityUtils.toString(response.getEntity());
+
+				metadata.setTitle(readTitleFromXML(result));
+				metadata.setDocumentId(readDocumentIdFromResult(result));
+
+			} catch (InterruptedException | ExecutionException e) {
+				logger.error("documentMetadata request failed", e);
+			} finally {
+				httpclient.close();
+			}
+		}
+
+		return metadata;
+	}
+
+	private String readDocumentIdFromResult(String documentIdFromXML) {
+		//the document URI returned by Mimir is used as the document ID
+		String docId = "";
+
+		if (documentIdFromXML == null) {
+			return null;
+		} else {
+			String pattern = "<documentURI>(.+)<\\/documentURI>";
+			Pattern r = Pattern.compile(pattern);
+			Matcher m = r.matcher(documentIdFromXML);
+			if (m.find()) {
+				docId = m.group(1);
+				return docId;
+			}else {
+				logger.error("No ID found in " + documentIdFromXML);
+			}
+		}
+		return null;
+	}
+
+	private String readTitleFromXML(String titleFromXML) {
+
+		String title = "";
+
+		if (titleFromXML == null) {
+			return null;
+		} else {
+			String pattern = "<documentTitle>(.*)<\\/documentTitle>";
+			Pattern r = Pattern.compile(pattern);
+			Matcher m = r.matcher(titleFromXML);
+			if (m.find()) {
+				title = m.group(1);
+				return title;
+			}else {
+				logger.error("No title found in " + titleFromXML);
+			}
+		}
+		return null;
+	}
+
+	private double documentScore(String queryId, String rank) throws IOException{
+
+		double score = 0.0;
+
+		if (queryId != null && !queryId.isEmpty()){
+
+			HttpPost post = new HttpPost(DOCUMENT_SCORE_URL);
+			post.setConfig(requestConfig);
+
+			// add request parameters or form parameters
+			List<NameValuePair> urlParameters = new ArrayList<>();
+			urlParameters.add(new BasicNameValuePair("queryId", queryId));
+			urlParameters.add(new BasicNameValuePair("rank", rank));
+
+			post.setEntity(new UrlEncodedFormEntity(urlParameters));
+
+			try (CloseableHttpClient httpClient = HttpClients.createDefault();
+				CloseableHttpResponse response = httpClient.execute(post)){
+
+				String result = EntityUtils.toString(response.getEntity());
+
+				score = readScoreFromXML(result);
+			}
+		}
+		return score;
+	}
+
+	private double readScoreFromXML(String scoreFromXML) {
+
+		if (scoreFromXML == null) {
+			return 0.0;
+		} else {
+			String pattern = "<value>(.*)<\\/value>";
+			Pattern r = Pattern.compile(pattern);
+			Matcher m = r.matcher(scoreFromXML);
+			if (m.find()) {
+				return Double.valueOf(m.group(1));
+			}else {
+				logger.error("No score found in " + scoreFromXML);
+			}
+		}
+		return 0.0;
+	}
+
+	private Metadata close(String queryId) throws IOException{
+
+		Metadata metadata = new Metadata();
+
+		if (queryId != null && !queryId.isEmpty()){
+
+			HttpPost post = new HttpPost(CLOSE_URL);
+			post.setConfig(requestConfig);
+
+			// add request parameters or form parameters
+			List<NameValuePair> urlParameters = new ArrayList<>();
+			urlParameters.add(new BasicNameValuePair("queryId", queryId));
+
+			post.setEntity(new UrlEncodedFormEntity(urlParameters));
+
+			try (CloseableHttpClient httpClient = HttpClients.createDefault();
+				CloseableHttpResponse response = httpClient.execute(post)){
+
+				String result = EntityUtils.toString(response.getEntity());
+
+				logger.info("Session with queryID " + queryId + " closed. " + result);
" + result); + + httpClient.close(); + } + + + + + } + return metadata; + } + + +} diff --git a/MimirSearchAPI/src/main/resources/logback.xml b/MimirSearchAPI/src/main/resources/logback.xml new file mode 100644 index 0000000..d58e27c --- /dev/null +++ b/MimirSearchAPI/src/main/resources/logback.xml @@ -0,0 +1,38 @@ + + + + + logs/output.log + + + logs/output-%d{yyyy-MM-dd}.%i.log + + + 100MB + + + + + %d - %-5level %logger{36} - %msg %n + + false + + + + + + %d - %-5level %logger{36} - %msg %n + + + + + + + + + + + + + + diff --git a/MimirSearchAPI/src/test/java/test/MimirSearchTest.java b/MimirSearchAPI/src/test/java/test/MimirSearchTest.java new file mode 100644 index 0000000..2e37459 --- /dev/null +++ b/MimirSearchAPI/src/test/java/test/MimirSearchTest.java @@ -0,0 +1,51 @@ +package test; + +import java.util.ArrayList; + +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import de.unijena.cs.fusion.mimir.Document; +import de.unijena.cs.fusion.mimir.MimirSearch; + +public class MimirSearchTest { + + private static final Logger logger = LoggerFactory.getLogger(MimirSearch.class); + + @Test + public void SimpleSearch() throws Exception { + + //provide a valid index URL + String INDEX_URL = "http://localhost:8080/mimir-cloud-6.2-SNAPSHOT//search/"; + + MimirSearch searchSession = new MimirSearch(INDEX_URL); + + ArrayList docs = searchSession.search("quercus"); + + + docs.stream().forEach(doc->{ + logger.info("Rank: " + doc.getRank() + ", Score: "+doc.getScore() + ", Title: "+doc.getTitle() + ", Hits: "+ doc.getHits().size()); + }); + + + } + + @Test + public void AnnotationSearch() throws Exception { + + //provide a valid index URL + String INDEX_URL = "http://localhost:8080/mimir-cloud-6.2-SNAPSHOT//search/"; + + MimirSearch searchSession = new MimirSearch(INDEX_URL); + + ArrayList docs = searchSession.search("{Organism} AND {Environment}"); + + + docs.stream().forEach(doc->{ + logger.info("Rank: " + doc.getRank() + ", Score: "+doc.getScore() + ", Title: "+doc.getTitle() + ", Hits: "+ doc.getHits().size()); + }); + + + } +} diff --git a/MimirTest/mimirTest/pom.xml b/MimirTest/mimirTest/pom.xml new file mode 100644 index 0000000..5384d1a --- /dev/null +++ b/MimirTest/mimirTest/pom.xml @@ -0,0 +1,53 @@ + + 4.0.0 + + de.unijena.cs.fusion + mimirTest + 0.0.1-SNAPSHOT + jar + + mimirTest + http://maven.apache.org + + + UTF-8 + 1.8 + 1.8 + + + + + junit + junit + 4.13 + test + + + + ch.qos.logback + logback-classic + 1.2.2 + + + + org.slf4j + slf4j-api + 1.7.30 + + + + de.unijena.cs.fusion + mimirSearchAPI + 0.0.1 + + + + com.fasterxml.jackson.dataformat + jackson-dataformat-csv + 2.10.3 + + + + + diff --git a/MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/EvaluationThread.java b/MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/EvaluationThread.java new file mode 100644 index 0000000..09a9afd --- /dev/null +++ b/MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/EvaluationThread.java @@ -0,0 +1,200 @@ +package de.unijena.cs.fusion.mimirTest; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import de.unijena.cs.fusion.mimir.Document; +import de.unijena.cs.fusion.mimir.MimirSearch; + +public class EvaluationThread implements Runnable { + + private static final Logger logger = 
+
+	String threadName;
+	String corpusName;
+	String indexURL;
+	long totalNumDocsInIndex;
+	ArrayList<String> queries;
+	HashMap<Integer, List<Rating>> groundTruth;
+	String outputPath;
+
+	EvaluationThread(String threadName, String corpusName, String indexURL, long totalNumDocsInIndex, ArrayList<String> queries, HashMap<Integer, List<Rating>> groundTruth, String outputPath) {
+		this.threadName = threadName;
+		this.corpusName = corpusName;
+		this.totalNumDocsInIndex = totalNumDocsInIndex;
+		this.queries = queries;
+		this.indexURL = indexURL;
+		this.outputPath = outputPath;
+		this.groundTruth = groundTruth;
+	}
+
+	public void run() {
+
+		if(getQueries() != null && getQueries().size() > 0){
+			int i = 1;
+
+			for(Iterator<String> it = getQueries().iterator(); it.hasNext();){
+				String next = it.next();
+
+				//skip queries that are commented out with '#'
+				if(!next.startsWith("#")){
+					Query query = new Query(i, next);
+
+					logger.info("[" + getCorpusName() + "-" + getThreadName() + "] Query: " + query.getQuery() + " ***");
+
+					MimirSearch searchSession = new MimirSearch(getIndexURL());
+
+					ArrayList<Document> docs = searchSession.search(query.getQuery());
+
+					//save results
+					try {
+						writeToFile(docs, Integer.toString(query.getID()));
+					} catch (IOException e) {
+						logger.error("Could not write results for query " + query.getID(), e);
+					}
+
+					docs.stream().forEach(doc->{
+						logger.info("[" + getCorpusName() + "-" + getThreadName() + "] Rank: " + doc.getRank() + ", Score: " + doc.getScore() + ", Title: " + doc.getTitle() + ", Hits: " + doc.getHits().size());
+					});
+				}
+				i++;
+			}
+
+			try {
+				logger.info("[" + getCorpusName() + "-" + getThreadName() + "] Computing metrics started ...");
+
+				logger.info("[" + getCorpusName() + "-" + getThreadName() + "] Computing metrics finished. Results saved in " + getOutputPath() + "\\" + getCorpusName() + "_" + getThreadName() + ".csv");
+			} catch (Exception e) {
+				logger.error("Computing metrics failed", e);
+			}
+		}
+	}
+
+	/**
+	 * write the results to a file in TREC format - TOPIC_NO Q0 ID RANK SCORE RUN_NAME:
+	 * TOPIC_NO is the topic number (1-30),
+	 * Q0 is a required but ignored constant,
+	 * ID is the identifier of the retrieved document (PMID or NCT ID),
+	 * RANK is the rank (1-1000) of the retrieved document,
+	 * SCORE is a floating point value representing the similarity score of the document,
+	 * and RUN_NAME is an identifier for the run; it is limited to 12 alphanumeric characters (no punctuation).
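+	 * Example line (illustrative values only):
+	 *   3 Q0 NCT00760162 5 0.8571 myRun1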
+	 * @param docs the ranked result documents
+	 * @param queryId the ID of the query the documents were retrieved for
+	 * @throws IOException
+	 */
+	private void writeToFile(ArrayList<Document> docs, String queryId) throws IOException{
+
+		if(docs != null) {
+			BufferedWriter writer = new BufferedWriter(new FileWriter(getOutputPath() + "/" + getCorpusName() + "_" + getThreadName() + "_files.txt", true));
+
+			docs.stream().forEach(doc->{
+				logger.info("[" + getCorpusName() + "-" + getThreadName() + "] Rank: " + doc.getRank() + ", Score: " + doc.getScore() + ", Title: " + doc.getTitle() + ", Hits: " + doc.getHits().size());
+				try {
+					writer.append(queryId);
+					writer.append(' ');
+					writer.append("Q0");
+					writer.append(' ');
+					writer.append(doc.getTitle());
+					writer.append(' ');
+					writer.append(Integer.toString(doc.getRank()));
+					writer.append(' ');
+					writer.append(String.valueOf(doc.getScore()));
+					writer.append(' ');
+					writer.append(getCorpusName() + "_" + getThreadName());
+					writer.newLine();
+				} catch (IOException e) {
+					logger.error("Could not append the result line for query " + queryId, e);
+				}
+			});
+
+			writer.close();
+		}
+	}
+
+	public String getCorpusName() {
+		return corpusName;
+	}
+
+	public void setCorpusName(String corpusName) {
+		this.corpusName = corpusName;
+	}
+
+	public long getTotalNumDocsInIndex() {
+		return totalNumDocsInIndex;
+	}
+
+	public void setTotalNumDocsInIndex(long totalNumDocsInIndex) {
+		this.totalNumDocsInIndex = totalNumDocsInIndex;
+	}
+
+	public ArrayList<String> getQueries() {
+		return queries;
+	}
+
+	public void setQueries(ArrayList<String> queries) {
+		this.queries = queries;
+	}
+
+	public HashMap<Integer, List<Rating>> getGroundTruth() {
+		return groundTruth;
+	}
+
+	public void setGroundTruth(HashMap<Integer, List<Rating>> groundTruth) {
+		this.groundTruth = groundTruth;
+	}
+
+	public String getIndexURL() {
+		return indexURL;
+	}
+
+	public void setIndexURL(String indexURL) {
+		this.indexURL = indexURL;
+	}
+
+	public String getOutputPath() {
+		return outputPath;
+	}
+
+	public void setOutputPath(String outputPath) {
+		this.outputPath = outputPath;
+	}
+
+	public String getThreadName() {
+		return threadName;
+	}
+
+	public void setThreadName(String threadName) {
+		this.threadName = threadName;
+	}
+}
diff --git a/MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/MimirTest.java b/MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/MimirTest.java
new file mode 100644
index 0000000..1f62296
--- /dev/null
+++ b/MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/MimirTest.java
@@ -0,0 +1,413 @@
+package de.unijena.cs.fusion.mimirTest;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.cs.unijena.fusion.ranking_metrics.Rating;
+import de.cs.unijena.fusion.ranking_metrics.RatingBefchina;
+import de.cs.unijena.fusion.ranking_metrics.RatingBioCaddie;
+
+public class MimirTest
+{
+	private static final Logger logger = LoggerFactory.getLogger(MimirTest.class);
+
+	/*
+	 * BEF-China settings
+	 */
+	static String INDEX_URL_BEFCHINA = "http://localhost:8080/mimir-cloud-6.2-SNAPSHOT/f8ec0055-ab4f-40a0-9b4c-d60d982cbdfd/search/";
+
+	//gold standard BEF-China
+	private static String PATH_TO_GROUND_TRUTH_BEFCHINA = "F:\\code\\Evaluation\\befchina\\befchina_gold_standard.txt";
+
+	//output folder path BEF-China
+	private static String OUTPUT_PATH_BEFCHINA = "F:\\code\\RankingResults\\befchina";
+
+	//path to URI queries BEF-China
+	private static final String URI_QUERIES_BEFCHINA = "F:\\code\\Evaluation\\befchina\\queries\\URI.txt";
+
+	//path to narrower queries BEF-China
+	private static final String URI_NARROWER_QUERIES_BEFCHINA = "F:\\code\\Evaluation\\befchina\\queries\\narrower.txt";
+
+	//path to simple broader queries BEF-China
+	private static final String SIMPLE_BROADER_QUERIES_BEFCHINA = "F:\\code\\Evaluation\\befchina\\queries\\broader.txt";
+
+	//path to core relation queries BEF-China
+	private static final String CORE_RELATION_QUERIES_BEFCHINA = "F:\\code\\Evaluation\\befchina\\queries\\coreRelation.txt";
+
+	//path to adapted queries BEF-China
+	private static final String ADAPTED_QUERIES_BEFCHINA = "F:\\code\\Evaluation\\befchina\\queries\\adapted.txt";
+
+	/*
+	 * BioCADDIE settings
+	 */
+	private static String INDEX_URL_BioCADDIE = "http://localhost:8080/mimir-cloud-6.2-SNAPSHOT/68b03370-8acb-4381-863d-b800be17bd5d/search/";
+
+	//gold standard BioCADDIE
+	private static String PATH_TO_GROUND_TRUTH_BIOCADDIE = "F:\\code\\Evaluation\\biocaddie\\biocaddie_ground_truth.txt";
+
+	//output folder path BioCADDIE
+	private static String OUTPUT_PATH_BIOCADDIE = "F:\\code\\RankingResults\\biocaddie";
+
+	//path to URI queries BioCADDIE
+	private static final String URI_QUERIES_BIOCADDIE = "F:\\code\\Evaluation\\biocaddie\\questions\\URI.txt";
+
+	//path to narrower queries BioCADDIE
+	private static final String URI_NARROWER_QUERIES_BIOCADDIE = "F:\\code\\Evaluation\\biocaddie\\questions\\narrower.txt";
+
+	//path to broader queries BioCADDIE
+	private static final String URI_BROADER_QUERIES_BIOCADDIE = "F:\\code\\Evaluation\\biocaddie\\questions\\broader.txt";
+
+	//path to core relation queries BioCADDIE
+	private static final String URI_CORE_RELATIONS_QUERIES_BIOCADDIE = "F:\\code\\Evaluation\\biocaddie\\questions\\coreRelation.txt";
+
+	//path to adapted queries BioCADDIE
+	private static final String URI_ADAPTED_QUERIES_BIOCADDIE = "F:\\code\\Evaluation\\biocaddie\\questions\\adapted.txt";
+
+	private static ArrayList<String> broaderQueries;
+
+	private static ArrayList<String> narrowerQueries;
+
+	private static ArrayList<String> URIQueries;
+
+	private static ArrayList<String> URIBroaderQueries;
+
+	private static ArrayList<String> coreRelationsQueries;
+
+	private static ArrayList<String> adaptedQueries;
+
+	private static HashMap<Integer, List<Rating>> groundTruth = null;
+
+	HashMap<Integer, List<Rating>> predictionsPerQuery = null;
+
+	//threshold above which a prediction counts as 'relevant':
+	//by default, all prediction values larger than 0 are considered relevant (binary rating);
+	//change this value if your predictions use a Likert scale (0 - not relevant,
+	//1 - partially relevant, 2 - highly relevant) and you only want to consider,
+	//e.g., highly relevant documents
+	private static final int RELEVANCE_THRESHOLD = 0;
+
+	public static void main( String[] args )
+	{
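+		// expected arguments (derived from the checks below):
+		//   args[0] - name of the output subfolder for this run
+		//   args[1] - test collection to evaluate: "befchina" or "biocaddie"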
+		if(args.length > 1 && args[1].equals("befchina")) {
+
+			DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss");
+			LocalDateTime now = LocalDateTime.now();
+
+			System.out.println("***** " + dtf.format(now) + " - Evaluation with BEF-China test collection started ..... *****");
+
+			OUTPUT_PATH_BEFCHINA = OUTPUT_PATH_BEFCHINA + "\\" + args[0];
+
+			//init the evaluation, load the gold standard and the queries
+			init(args[1], PATH_TO_GROUND_TRUTH_BEFCHINA, URI_QUERIES_BEFCHINA, URI_NARROWER_QUERIES_BEFCHINA, SIMPLE_BROADER_QUERIES_BEFCHINA, null, CORE_RELATION_QUERIES_BEFCHINA, ADAPTED_QUERIES_BEFCHINA);
+
+			//BEF-China evaluation
+			befchinaEvaluation(args[1]);
+
+			now = LocalDateTime.now();
+			System.out.println("***** " + dtf.format(now) + " - Evaluation with BEF-China test collection .... done. *****");
+		}
+		else if(args.length > 1 && args[1].equals("biocaddie")) {
+			DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss");
+			LocalDateTime now = LocalDateTime.now();
+
+			OUTPUT_PATH_BIOCADDIE = OUTPUT_PATH_BIOCADDIE + "\\" + args[0];
+
+			System.out.println("***** " + dtf.format(now) + " - Evaluation with BioCADDIE test collection started ..... *****");
+
+			//init the evaluation, load the gold standard and the queries
+			init(args[1], PATH_TO_GROUND_TRUTH_BIOCADDIE, URI_QUERIES_BIOCADDIE, URI_NARROWER_QUERIES_BIOCADDIE, URI_BROADER_QUERIES_BIOCADDIE, null, URI_CORE_RELATIONS_QUERIES_BIOCADDIE, URI_ADAPTED_QUERIES_BIOCADDIE);
+
+			//BioCADDIE evaluation
+			biocaddieEvaluation(args[1]);
+
+			now = LocalDateTime.now();
+			System.out.println("***** " + dtf.format(now) + " - Evaluation with BioCADDIE test collection .... done. *****");
+		}else {
+			System.err.println("Usage: MimirTest <outputFolderName> <corpus>. The system only supports the following test collection corpora as second argument: 'befchina' (372 documents, domain: biodiversity research) or 'biocaddie' (~795,000 documents, domain: biomedicine)");
+		}
+	}
+
+	private static void befchinaEvaluation(String corpusName) {
+		// total number of documents in the index
+		long totalNumDocsInIndex = 372;
+
+		EvaluationThread simpleURIThread = new EvaluationThread("URIQueries", corpusName, INDEX_URL_BEFCHINA, totalNumDocsInIndex, URIQueries, groundTruth, OUTPUT_PATH_BEFCHINA);
+		new Thread(simpleURIThread).start();
+
+		EvaluationThread narrowerThread = new EvaluationThread("NarrowerQueries", corpusName, INDEX_URL_BEFCHINA, totalNumDocsInIndex, narrowerQueries, groundTruth, OUTPUT_PATH_BEFCHINA);
+		new Thread(narrowerThread).start();
+
+		EvaluationThread broaderThread = new EvaluationThread("BroaderQueries", corpusName, INDEX_URL_BEFCHINA, totalNumDocsInIndex, broaderQueries, groundTruth, OUTPUT_PATH_BEFCHINA);
+		new Thread(broaderThread).start();
+
+		EvaluationThread coreRelationsThread = new EvaluationThread("CoreRelationsQueries", corpusName, INDEX_URL_BEFCHINA, totalNumDocsInIndex, coreRelationsQueries, groundTruth, OUTPUT_PATH_BEFCHINA);
+		new Thread(coreRelationsThread).start();
+
+		EvaluationThread adaptedThread = new EvaluationThread("AdaptedQueries", corpusName, INDEX_URL_BEFCHINA, totalNumDocsInIndex, adaptedQueries, groundTruth, OUTPUT_PATH_BEFCHINA);
+		new Thread(adaptedThread).start();
+	}
+
+	/**
+	 * BioCADDIE evaluation
+	 * please note: as the number of parallel connections the server accepts can be limited,
+	 * consider running only two threads at a time
+	 * @param corpusName
+	 */
+	private static void biocaddieEvaluation(String corpusName) {
+		// total number of documents in the index
+		long totalNumDocsInIndex = 794983;
+
+		EvaluationThread simpleURIThread = new EvaluationThread("SimpleURIQueries", corpusName, INDEX_URL_BioCADDIE, totalNumDocsInIndex, URIQueries, groundTruth, OUTPUT_PATH_BIOCADDIE);
+		new Thread(simpleURIThread).start();
+
+		EvaluationThread narrowerThread = new EvaluationThread("NarrowerQueries", corpusName, INDEX_URL_BioCADDIE, totalNumDocsInIndex, narrowerQueries, groundTruth, OUTPUT_PATH_BIOCADDIE);
+		new Thread(narrowerThread).start();
+
+		EvaluationThread coreRelationsThread = new EvaluationThread("CoreRelationsQueries", corpusName, INDEX_URL_BioCADDIE, totalNumDocsInIndex, coreRelationsQueries, groundTruth, OUTPUT_PATH_BIOCADDIE);
+		new Thread(coreRelationsThread).start();
+
+		EvaluationThread broaderThread = new EvaluationThread("BroaderQueries", corpusName, INDEX_URL_BioCADDIE, totalNumDocsInIndex, broaderQueries, groundTruth, OUTPUT_PATH_BIOCADDIE);
+		new Thread(broaderThread).start();
+
+		EvaluationThread adaptedThread = new EvaluationThread("AdaptedQueries", corpusName, INDEX_URL_BioCADDIE, totalNumDocsInIndex, adaptedQueries, groundTruth, OUTPUT_PATH_BIOCADDIE);
+		new Thread(adaptedThread).start();
+	}
+
+	@SuppressWarnings("unchecked")
+	private static void init(String corpusName, String pathToGroundTruth, String pathToURIqueries, String pathToNarrowerQueries, String pathToBroaderQueries, String pathToSimpleBroaderQueries, String pathToCoreRelationsQueries, String pathToAdaptedQueries){
+
+		//read the benchmark ratings - the ground truth
+		try (Stream<String> stream = Files.lines(Paths.get(pathToGroundTruth))) {
+
+			if(corpusName.equals("befchina")) {
+				//2. convert every line into a Rating element
+				//3. group the ratings into a map by query ID
+				groundTruth = (HashMap<Integer, List<Rating>>) stream
+						.filter(line -> !"".equals(line))
+						.map(line -> createRatingBefchina(line))
+						.collect(Collectors.groupingBy(Rating::getQueryId));
+			}
+			else if(corpusName.equals("biocaddie")) {
+				//2. convert every line into a Rating element
+				//3. keep only the relevant ratings and group them into a map by query ID
+				List<Rating> groundTruthPreList = stream
+						.filter(line -> !"".equals(line))
+						.map(line -> createRatingCaddie(line))
+						.collect(Collectors.toCollection(ArrayList::new));
+
+				groundTruth = (HashMap<Integer, List<Rating>>) groundTruthPreList.stream()
+						.filter(x -> x.getRating() > RELEVANCE_THRESHOLD)
+						.collect(Collectors.groupingBy(Rating::getQueryId));
+
+				logger.info("Loaded ground truth ratings for " + groundTruth.size() + " queries");
+			}
+
+		} catch (IOException e) {
+			logger.error("Could not read the ground truth from " + pathToGroundTruth, e);
+		}
+
+		//read the narrower benchmark queries
+		try (Stream<String> stream = Files.lines(Paths.get(pathToNarrowerQueries))) {
+
+			narrowerQueries = (ArrayList<String>) stream
+					.filter(line -> !"".equals(line))
+					.collect(Collectors.toList());
+
+		} catch (IOException e) {
+			logger.error("Could not read the queries from " + pathToNarrowerQueries, e);
+		}
+
+		//read the URI benchmark queries
+		try (Stream<String> stream = Files.lines(Paths.get(pathToURIqueries))) {
+
+			URIQueries = (ArrayList<String>) stream
+					.filter(line -> !"".equals(line))
+					.collect(Collectors.toList());
+
+		} catch (IOException e) {
+			logger.error("Could not read the queries from " + pathToURIqueries, e);
+		}
+
+		//read the broader benchmark queries
+		try (Stream<String> stream = Files.lines(Paths.get(pathToBroaderQueries))) {
+
+			broaderQueries = (ArrayList<String>) stream
+					.filter(line -> !"".equals(line))
+					.collect(Collectors.toList());
+
+		} catch (IOException e) {
+			logger.error("Could not read the queries from " + pathToBroaderQueries, e);
+		}
+
+		//read the core relation benchmark queries
+		if (pathToCoreRelationsQueries != null) {
+			try (Stream<String> stream = Files.lines(Paths.get(pathToCoreRelationsQueries))) {
+
+				coreRelationsQueries = (ArrayList<String>) stream
+						.filter(line -> !"".equals(line))
+						.collect(Collectors.toList());
+
+			} catch (IOException e) {
+				logger.error("Could not read the queries from " + pathToCoreRelationsQueries, e);
+			}
+		}
+
+		//read the adapted benchmark queries
+		if (pathToAdaptedQueries != null) {
+			try (Stream<String> stream = Files.lines(Paths.get(pathToAdaptedQueries))) {
+
+				adaptedQueries = (ArrayList<String>) stream
+						.filter(line -> !"".equals(line))
+						.collect(Collectors.toList());
+
+			} catch (IOException e) {
+				logger.error("Could not read the queries from " + pathToAdaptedQueries, e);
+			}
+		}
+
+		//read the simple broader benchmark queries
+		if(pathToSimpleBroaderQueries != null) {
+			try (Stream<String> stream = Files.lines(Paths.get(pathToSimpleBroaderQueries))) {
+
+				URIBroaderQueries = (ArrayList<String>) stream
+						.filter(line -> !"".equals(line))
+						.collect(Collectors.toList());
+
+			} catch (IOException e) {
+				logger.error("Could not read the queries from " + pathToSimpleBroaderQueries, e);
+			}
+		}
+	}
+
+	private static Rating createRatingCaddie(String line) {
+		String[] ratingArray = line.split("::");
+
+		return new RatingBioCaddie(
+				Integer.parseInt(ratingArray[0]),	//first number is the topic/question number (the second entry is just '0')
+				Integer.parseInt(ratingArray[2]),	//third entry denotes the document number
+				Integer.parseInt(ratingArray[3]));	//fourth number is the prediction
+	}
+
+	private static Rating createRatingBefchina(String line){
+
+		String[] ratingArray = line.split("::");
+		return new RatingBefchina(
+				Integer.parseInt(ratingArray[0]),	//first number is the topic/question
+				Integer.parseInt(ratingArray[1]),	//second number is the document number
+				Integer.parseInt(ratingArray[2]));	//third number
denotes the prediction + + } + + + + +} diff --git a/MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/Query.java b/MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/Query.java new file mode 100644 index 0000000..b5785f3 --- /dev/null +++ b/MimirTest/mimirTest/src/main/java/de/unijena/cs/fusion/mimirTest/Query.java @@ -0,0 +1,33 @@ +package de.unijena.cs.fusion.mimirTest; + +public class Query { + int ID; + String query; + + public Query(){ + + } + + public Query(int ID, String query){ + this.ID = ID; + this.query = query; + } + +public int getID() { + return ID; +} + +public void setID(int iD) { + ID = iD; +} + +public String getQuery() { + return query; +} + +public void setQuery(String query) { + this.query = query; +} + + +} diff --git a/MimirTest/mimirTest/src/main/resources/logback.xml b/MimirTest/mimirTest/src/main/resources/logback.xml new file mode 100644 index 0000000..7b2abc8 --- /dev/null +++ b/MimirTest/mimirTest/src/main/resources/logback.xml @@ -0,0 +1,40 @@ + + + + + logs/output.log + + + logs/output-%d{yyyy-MM-dd}.%i.log + + + 100MB + + + + + %d - %-5level %logger{36} - %msg %n + + false + + + + + + %d - %-5level %logger{36} - %msg %n + + + + + + + + + + + + + + + + diff --git a/MimirTest/mimirTest/src/test/java/de/uni_jena/cs/fusion/mimirTest/AppTest.java b/MimirTest/mimirTest/src/test/java/de/uni_jena/cs/fusion/mimirTest/AppTest.java new file mode 100644 index 0000000..6f9f48c --- /dev/null +++ b/MimirTest/mimirTest/src/test/java/de/uni_jena/cs/fusion/mimirTest/AppTest.java @@ -0,0 +1,38 @@ +package de.uni_jena.cs.fusion.mimirTest; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +/** + * Unit test for simple App. + */ +public class AppTest + extends TestCase +{ + /** + * Create the test case + * + * @param testName name of the test case + */ + public AppTest( String testName ) + { + super( testName ); + } + + /** + * @return the suite of tests being tested + */ + public static Test suite() + { + return new TestSuite( AppTest.class ); + } + + /** + * Rigourous Test :-) + */ + public void testApp() + { + assertTrue( true ); + } +} diff --git a/slib-sml/src/main/java/slib/sml/sm/core/engine/SM_Engine.java b/slib-sml/src/main/java/slib/sml/sm/core/engine/SM_Engine.java new file mode 100644 index 0000000..76c1f06 --- /dev/null +++ b/slib-sml/src/main/java/slib/sml/sm/core/engine/SM_Engine.java @@ -0,0 +1,84 @@ + + +/***** inside the SM_Engine class add the following methods to compute the node level *****/ + + /** + * Get hierarchy level of a current vertex + * assumption: all vertexes in that list are on the same hierarchy level + * @param v + * @return the hierarchy level of the given vertex + * @throws SLIB_Ex_Critic + */ + public long getNodeLevel(URI v) throws SLIB_Ex_Critic { + + throwErrorIfNotClass(v); + + long level = 0; + Set a = getAncestorsInc(v); + + Set ancestors = new HashSet(); + + //getAncestorsInc returns an unmodifiable Set, so we need to copy the Set into a new one + //getAncestorsInc also contains the vertex itself - remove it + for(URI uri : a){ + if(!uri.equals(v)) + //System.out.println(uri); + ancestors.add(uri); + } + + Set parents; + Set vertexes = new HashSet(); + vertexes.add(v); + boolean stop=false; + + //as long as we have ancestors + while(ancestors.size()>0 && stop==false){ + + //per hierarchy level +1 + level++; + //System.out.println("---------Level: " + level+"------"); + for(URI uri : vertexes){ + parents = topNodeAccessor.getNeighbors(uri); + 
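+					// one step up the hierarchy: the neighbors returned by the (bottom-up)
+					// accessor are taken to be the direct parents of uri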
parents.remove(v); + + /*for(URI parent : parents){ + System.out.println("Parent: " + parent); + }*/ + + if(parents!=null && parents.size()>0){ + for(URI p : parents){ + if(ancestors.contains(p)){ + //remove all the parents from the ancestors (this hierarchy level has been counted already) + ancestors.remove(p); + } + } + //but analyze if the parents have parents (= grandparents) + vertexes = parents; + }else{ + //no parents left - stop, set ancestors to size=0 + stop=true; + } + } + + /*for(URI anc : ancestors){ + System.out.println("Ancestor: " + anc); + }*/ + } + + return level; + } + + public Map computeNodeLevel() throws SLIB_Ex_Critic { + Map nodes = new HashMap(); + + Set allClasses = this.getClasses(); + + for(URI uri : allClasses){ + //System.out.println(uri); + nodes.put(uri,getNodeLevel(uri)); + } + + return nodes; + } + + \ No newline at end of file diff --git a/slibAPI/pom.xml b/slibAPI/pom.xml new file mode 100644 index 0000000..ddddfc3 --- /dev/null +++ b/slibAPI/pom.xml @@ -0,0 +1,203 @@ + + 4.0.0 + de.unijena.cs.fusion + slibAPI + 0.0.1 + + + + UTF-8 + 1.8 + 1.8 + + + + + + + + com.github.sharispe + slib-graph + 0.9.5 + pom + + + ch.qos.logback + logback-classic + + + org.slf4j + log4j-over-slf4j + + + org.slf4j + jcl-over-slf4j + + + org.slf4j + slf4j-log4j12 + + + + + + + com.github.sharispe + slib-graph-io + 0.9.5 + + + ch.qos.logback + logback-classic + + + org.slf4j + log4j-over-slf4j + + + org.slf4j + jcl-over-slf4j + + + org.slf4j + slf4j-log4j12 + + + + + + com.github.sharispe + slib-sml + 0.9.5 + + + ch.qos.logback + logback-classic + + + org.slf4j + log4j-over-slf4j + + + org.slf4j + jcl-over-slf4j + + + org.slf4j + slf4j-log4j12 + + + + + + com.github.sharispe + slib-utils + 0.9.5 + + + ch.qos.logback + logback-classic + + + org.slf4j + log4j-over-slf4j + + + org.slf4j + jcl-over-slf4j + + + org.slf4j + slf4j-log4j12 + + + + + + com.github.sharispe + slib-graph-model-impl + 0.9.5 + + + ch.qos.logback + logback-classic + + + org.slf4j + log4j-over-slf4j + + + org.slf4j + jcl-over-slf4j + + + org.slf4j + slf4j-log4j12 + + + + + com.github.sharispe + slib-indexer + 0.9.5 + + + + ch.qos.logback + logback-classic + + + org.slf4j + log4j-over-slf4j + + + org.slf4j + jcl-over-slf4j + + + org.slf4j + slf4j-log4j12 + + + + + + com.github.sharispe + slib-graph-algo + 0.9.5 + + + ch.qos.logback + logback-classic + + + org.slf4j + log4j-over-slf4j + + + org.slf4j + jcl-over-slf4j + + + org.slf4j + slf4j-log4j12 + + + + + + + org.slf4j + slf4j-api + 1.7.30 + + + + + + \ No newline at end of file diff --git a/slibAPI/src/main/java/de/unijena/cs/fusion/slib/SML.java b/slibAPI/src/main/java/de/unijena/cs/fusion/slib/SML.java new file mode 100644 index 0000000..a16735a --- /dev/null +++ b/slibAPI/src/main/java/de/unijena/cs/fusion/slib/SML.java @@ -0,0 +1,301 @@ + + +package de.unijena.cs.fusion.slib; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.openrdf.model.URI; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import slib.graph.io.conf.GDataConf; +import slib.graph.io.loader.GraphLoaderGeneric; +import slib.graph.io.util.GFormat; +import slib.graph.model.graph.G; +import slib.graph.model.impl.graph.memory.GraphMemory; +import slib.graph.model.impl.repo.URIFactoryMemory; +import slib.graph.model.repo.URIFactory; +import slib.sml.sm.core.engine.SM_Engine; +import 
slib.sml.sm.core.metrics.ic.utils.IC_Conf_Topo;
+import slib.sml.sm.core.metrics.ic.utils.ICconf;
+import slib.sml.sm.core.utils.SMConstants;
+import slib.sml.sm.core.utils.SMconf;
+import slib.utils.ex.SLIB_Ex_Critic;
+
+public class SML {
+	static Logger logger = LoggerFactory.getLogger(SML.class);
+
+	static URIFactory factory;
+
+	public ArrayList<String> URIsInQuery;
+
+	//adjust if needed - provide the path to your slibAPI/res/vocabs folder with all ontologies to be loaded
+	public static String FOLDER_WITH_GRAPHS = "C:/slibAPI/res/vocabs/";
+
+	public static HashMap<String, SM_Engine> graphMap;
+
+	public HashMap<String, Map<Long, Set<URI>>> graphNodeLevelMap;
+
+	public HashMap<String, Map<URI, Long>> graphURINodeLevelMap;
+
+	public HashMap<String, Map<URI, Long>> getGraphURINodeLevelMap() {
+		return graphURINodeLevelMap;
+	}
+
+	public void setGraphURINodeLevelMap(HashMap<String, Map<URI, Long>> graphURINodeLevelMap) {
+		this.graphURINodeLevelMap = graphURINodeLevelMap;
+	}
+
+	public ArrayList<String> getURIsInQuery() {
+		return URIsInQuery;
+	}
+
+	public void setURIsInQuery(ArrayList<String> uRIsInQuery) {
+		URIsInQuery = uRIsInQuery;
+	}
+
+	public HashMap<String, SM_Engine> getGraphMap() {
+		return graphMap;
+	}
+
+	public void setGraphMap(HashMap<String, SM_Engine> graphMap) {
+		this.graphMap = graphMap;
+	}
+
+	public HashMap<String, Map<Long, Set<URI>>> getGraphNodeLevelMap() {
+		return graphNodeLevelMap;
+	}
+
+	public void setGraphNodeLevelMap(HashMap<String, Map<Long, Set<URI>>> graphNodeLevelMap) {
+		this.graphNodeLevelMap = graphNodeLevelMap;
+	}
+
+	public SML(){
+		logger.info("Loading required Data");
+
+		factory = URIFactoryMemory.getSingleton();
+		graphMap = new HashMap<String, SM_Engine>();
+		graphNodeLevelMap = new HashMap<String, Map<Long, Set<URI>>>();
+		graphURINodeLevelMap = new HashMap<String, Map<URI, Long>>();
+
+		//load all graphs in the folder
+		List<File> allGraphs = iterateOverFiles(new File(FOLDER_WITH_GRAPHS));
+
+		//derive the graph name from the file name
+		for(File g : allGraphs){
+
+			String fileName = g.getName();
+
+			if(fileName != null){
+				String vocabulary = fileName.split("\\.")[0];
+				logger.info("graph: " + vocabulary);
+
+				URI graph_uri = factory.getURI("http://" + vocabulary + "/");
+				G graph = new GraphMemory(graph_uri);
+				GDataConf graphconf;
+
+				if (g.getName().toLowerCase().endsWith("ttl")) {
+					graphconf = new GDataConf(GFormat.TURTLE, g.getPath());
+				}
+				else if(g.getName().toLowerCase().endsWith("obo")){
+					graphconf = new GDataConf(GFormat.OBO, g.getPath());
+				}
+				else{
+					graphconf = new GDataConf(GFormat.RDF_XML, g.getPath());
+				}
+
+				try {
+					GraphLoaderGeneric.populate(graphconf, graph);
+
+					// general information about the graph
+					SM_Engine engine = new SM_Engine(graph);
+					logger.info("classes in graph " + vocabulary + ": " + engine.getClasses().size());
+
+					graphMap.put(vocabulary.toLowerCase(), engine);
+
+					logger.info("Computing Node Level");
+					Map<URI, Long> URI_NL_Map = engine.computeNodeLevel();
+					this.graphURINodeLevelMap.put(vocabulary, URI_NL_Map);
+					logger.info("Node Level Computation done");
+
+					Map<Long, Set<URI>> nodeLevelMap = computeURIsPerNodeLevel(URI_NL_Map);
+					graphNodeLevelMap.put(vocabulary, nodeLevelMap);
+
+				} catch (Exception e) {
+					logger.error("Error loading graph " + vocabulary + ": " + e.getMessage(), e);
+				}
+			}
+		}
+	}
+
+	private Map<Long, Set<URI>> computeURIsPerNodeLevel(Map<URI, Long> URI_NL_Map) {
+
+		Map<Long, Set<URI>> nodeLevelMap = new HashMap<Long, Set<URI>>();
+
+		if (URI_NL_Map != null) {
+			for (Iterator<URI> it = URI_NL_Map.keySet().iterator(); it.hasNext();) {
+				URI uri = it.next();
+				long nodeLevel = URI_NL_Map.get(uri);
+
+				if(!nodeLevelMap.containsKey(nodeLevel)) {
+					Set<URI> URIs = new HashSet<URI>();
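+					// first URI seen on this hierarchy level - start a new set for it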
+					URIs.add(uri);
+					nodeLevelMap.put(nodeLevel, URIs);
+				}else {
+					Set<URI> URIs = nodeLevelMap.get(nodeLevel);
+
+					if(!URIs.contains(uri))
+						URIs.add(uri);
+
+					nodeLevelMap.put(nodeLevel, URIs);
+				}
+			}
+		}
+		return nodeLevelMap;
+	}
+
+	public static List<File> iterateOverFiles(File folder) {
+
+		List<File> result = new ArrayList<File>();
+
+		if (folder.isDirectory()) {
+			File[] allFiles = folder.listFiles();
+
+			for(File file : allFiles){
+				File semFile = findFileswithSemanticExtension(file);
+				if(semFile != null) {
+					result.add(semFile);
+				}
+			}
+		} else {
+			logger.error("file " + folder + " is not a directory");
+		}
+
+		return result;
+	}
+
+	public static File findFileswithSemanticExtension(File file) {
+		if(file.getName().toLowerCase().endsWith("owl") || file.getName().toLowerCase().endsWith("rdf") || file.getName().toLowerCase().endsWith("ttl") || file.getName().toLowerCase().endsWith("obo") || file.getName().toLowerCase().endsWith("nt")) {
+			return file;
+		}
+		else{
+			logger.error("no semantic formats found! Please provide *.owl, *.rdf, *.ttl, *.obo or *.nt files");
+		}
+		return null;
+	}
+
+	private static String getGraphName(String URI) {
+		String[] graphURIName = URI.split("/");
+		String[] vocabNameArray = graphURIName[graphURIName.length-1].split("_");
+		String vocab = vocabNameArray[0];
+
+		return vocab.toLowerCase();
+	}
+
+	public double computeSemSim(String URIA, String URIB) throws SLIB_Ex_Critic {
+		URI nodeA = factory.getURI(URIA);
+		String vocab = getGraphName(URIA);
+
+		URI nodeB = factory.getURI(URIB);
+		String vocabB = getGraphName(URIB);
+
+		double sim = 0.0;
+
+		SM_Engine engine = graphMap.get(vocab);
+
+		if(engine != null){
+			// retrieve the inclusive ancestors and descendants of the vertex
+			Set<URI> nodeA_Ancs = engine.getAncestorsInc(nodeA);
+			Set<URI> nodeA_Descs = engine.getDescendantsInc(nodeA);
+
+			// first we define the information content (IC) we will use
+			//ICconf icConf = new IC_Conf_Topo("Sanchez", SMConstants.FLAG_ICI_SANCHEZ_2011);
+			ICconf icConf = new IC_Conf_Topo("Zhou", SMConstants.FLAG_ICI_ZHOU_2008);
+			//ICconf icConf = new IC_Conf_Topo("Harispe", SMConstants.FLAG_ICI_HARISPE_2012);
+
+			// then we define the semantic measure configuration
+			SMconf smConf = new SMconf("Resnik", SMConstants.FLAG_SIM_PAIRWISE_DAG_NODE_RESNIK_1995);
+			//SMconf smConf = new SMconf("Lin", SMConstants.FLAG_SIM_PAIRWISE_DAG_NODE_LIN_1998);
+			//SMconf smConf = new SMconf("Harispe", SMConstants.FLAG_SIM_PAIRWISE_DAG_NODE_HARISPE_2013); //node-based pairwise
+			smConf.setICconf(icConf);
+
+			// finally, we compute the similarity between the two concepts,
+			// e.g. between 'Forest Biome' and 'shrub'
+			try{
+				sim = engine.compare(smConf, nodeA, nodeB);
+			}
+			catch(Exception e){
+				//the semantic similarity cannot be computed for URIs that do not share the same root
+				logger.info("Semantic similarity cannot be computed for URIs: " + URIA + ", " + URIB);
+				logger.info(e.getMessage());
+
+				sim = 0.0;
+			}
+		}
+		//else: the similarity stays 0.0 if the URIs are located in different graphs
+		return sim;
+	}
+}
diff --git a/slibAPI/src/main/resources/logback.xml b/slibAPI/src/main/resources/logback.xml
new file mode 100644
index 0000000..267660d
--- /dev/null
+++ b/slibAPI/src/main/resources/logback.xml
@@ -0,0 +1,37 @@
+<configuration>
+
+	<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
+		<file>logs/output.log</file>
+		<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
+			<fileNamePattern>logs/output-%d{yyyy-MM-dd}.%i.log</fileNamePattern>
+			<maxFileSize>100MB</maxFileSize>
+		</rollingPolicy>
+		<encoder>
+			<pattern>%d - %-5level %logger{36} - %msg %n</pattern>
+			<immediateFlush>false</immediateFlush>
+		</encoder>
+	</appender>
+
+	<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+		<encoder>
+			<pattern>%d - %-5level %logger{36} - %msg %n</pattern>
+		</encoder>
+	</appender>
+
+	<root level="info">
+		<appender-ref ref="FILE" />
+		<appender-ref ref="STDOUT" />
+	</root>
+
+</configuration>