Skip to content

Commit

Permalink
Merge branch 'release/30.x'
Browse files Browse the repository at this point in the history
* release/30.x:
  #4410 - OpenNLP multi-token sequence classifier does not learn from annotations without labels
  #4412 - PMC search returns some results that cannot be imported
  • Loading branch information
reckart committed Dec 28, 2023
2 parents 0839ec0 + d4b0399 commit 07517e4
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ public List<ExternalSearchResult> executeQuery(DocumentRepository aDocumentRepos
PubMedProviderTraits aTraits, String aQuery)
{
var date = Instant.now().atZone(ZoneOffset.UTC).minus(Duration.ofHours(24));
var query = aQuery + " AND \"free full text\"[filter] AND (\"0001/01/01\"[PubDate] : \""
var query = aQuery + " AND \"open access\"[filter] AND (\"0001/01/01\"[PubDate] : \""
+ date.get(YEAR) + "/" + date.get(MONTH_OF_YEAR) + "/" + date.get(DAY_OF_MONTH)
+ "\"[PubDate])";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import static de.tudarmstadt.ukp.inception.support.json.JSONUtil.toJsonString;

import java.io.IOException;
import java.lang.invoke.MethodHandles;

import org.apache.wicket.markup.html.panel.Panel;
import org.apache.wicket.model.IModel;
Expand Down Expand Up @@ -50,7 +51,7 @@
public class PubMedCentralProviderFactory
implements BeanNameAware, ExternalSearchProviderFactory<PubMedProviderTraits>
{
private final Logger log = LoggerFactory.getLogger(getClass());
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

private final EntrezClient entrezClient;
private final PmcOaClient pmcOaClient;
Expand Down Expand Up @@ -81,7 +82,7 @@ public String getBeanName()
@Override
public String getDisplayName()
{
return "PubMed Central (experimental)";
return "PubMed Central Open Access (experimental)";
}

@Override
Expand All @@ -105,7 +106,7 @@ public PubMedProviderTraits readTraits(DocumentRepository aDocumentRepository)
aDocumentRepository.getProperties());
}
catch (IOException e) {
log.error("Error while reading traits", e);
LOG.error("Error while reading traits", e);
}

if (traits == null) {
Expand All @@ -124,7 +125,7 @@ public void writeTraits(DocumentRepository aDocumentRepository,
aDocumentRepository.setProperties(json);
}
catch (IOException e) {
log.error("Error while writing traits", e);
LOG.error("Error while writing traits", e);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
<html xmlns="http://www.w3.org/1999/xhtml"
xmlns:wicket="http://wicket.apache.org/dtds.data/wicket-xhtml1.4-strict.dtd">
<wicket:panel>
<div class="offset-sm-3 d-flex text-muted" role="alert">
<i class="fas fa-info-circle mr-2 mt-1"></i>
<p class="mb-0 mx-2 small">
<wicket:message key="openAccessHint"/>
</p>
</div>
<form wicket:id="form">
</form>
</wicket:panel>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
openAccessHint=#(product.name) uses the BioC version of the PMC documents for import. This is only available for \
Open Access texts. #(product.name) automatically adds a filter for open access results \
(<code>"open access"[filter]</code>) to the query. The BioC version of these texts may be available only with a \
delay. #(product.name) automatically excludes results that were published in the last 24h to try to keep the \
  number of non-importable results low. If you are still unable to import a result, try again a bit later.
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ support for the BioC format used by this repository connector.
link:https://www.ncbi.nlm.nih.gov/pmc/[PubMed Central]® (PMC) is a free full-text archive of biomedical and life sciences journal literature at the U.S. National Institutes of Health's National Library of Medicine (NIH/NLM). It can be added as an external document repository by
selecting the **PubMed Central** repository type.

NOTE: {product-name} uses the BioC version of the PMC documents for import. The search tries to
consider only documents that have full text available, but the BioC version of these texts may be
available only with a delay. Thus, if you cannot import a recently uploaded document from PMC into
{product-name}, you may try it again a day later and have more success.
NOTE: {product-name} uses the BioC version of the PMC documents for import. This is only available for
Open Access texts. {product-name} automatically adds a filter for open access results (`"open access"[filter]`)
to the query. The BioC version of these texts may be available only with a delay. {product-name} automatically
excludes results that were published in the last 24h to try to keep the number of non-importable results low.
If you are still unable to import a result, try again a bit later.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import static de.tudarmstadt.ukp.inception.rendering.model.Range.rangeCoveringAnnotations;
import static de.tudarmstadt.ukp.inception.support.uima.WebAnnoCasUtil.selectOverlapping;
import static org.apache.commons.lang3.StringUtils.isBlank;
import static org.apache.commons.lang3.StringUtils.isNotBlank;
import static org.apache.uima.fit.util.CasUtil.getType;
import static org.apache.uima.fit.util.CasUtil.selectCovered;

Expand Down Expand Up @@ -153,12 +152,14 @@ public Range predict(PredictionContext aContext, CAS aCas, int aBegin, int aEnd)

for (var prediction : finder.find(tokens)) {
var label = prediction.getType();
if (NameSample.DEFAULT_TYPE.equals(label)) {
continue;
if (NameSample.DEFAULT_TYPE.equals(label) || BLANK_LABEL.equals(label)) {
label = null;
}

int begin = tokenAnnotations.get(prediction.getStart()).getBegin();
int end = tokenAnnotations.get(prediction.getEnd() - 1).getEnd();
var annotation = aCas.createAnnotation(predictedType, begin, end);

annotation.setStringValue(predictedFeature, label);
if (scoreFeature != null) {
annotation.setDoubleValue(scoreFeature, prediction.getProb());
Expand Down Expand Up @@ -221,7 +222,7 @@ public EvaluationResult evaluate(List<CAS> aCasses, DataSplitter aDataSplitter)
return result;
}

LOG.info("Training on [{}] sentences, predicting on [{}] of total [{}]", trainingSet.size(),
LOG.info("Training on [{}] samples, predicting on [{}] of total [{}]", trainingSet.size(),
testSet.size(), data.size());

// Train model
Expand All @@ -238,11 +239,11 @@ public EvaluationResult evaluate(List<CAS> aCasses, DataSplitter aDataSplitter)
nameFinder.clearAdaptiveData();

// Span contains one NE, Array of them all in one sentence
var sentence = sample.getSentence();
var predictedNames = nameFinder.find(sentence);
var sampleTokens = sample.getSentence();
var predictedNames = nameFinder.find(sampleTokens);
var goldNames = sample.getNames();

labelPairs.addAll(determineLabelsForASentence(sentence, predictedNames, goldNames));
labelPairs.addAll(determineLabelsForASentence(sampleTokens, predictedNames, goldNames));
}

return labelPairs.stream().collect(toEvaluationResult(DATAPOINT_UNIT.getSimpleName(),
Expand All @@ -268,7 +269,7 @@ private List<LabelPair> determineLabelsForASentence(String[] sentence, Span[] pr

var predictedLabel = NO_NE_TAG;
if (predictedNameIdx < predictedNames.length) {
Span predictedName = predictedNames[predictedNameIdx];
var predictedName = predictedNames[predictedNameIdx];
predictedLabel = determineLabel(predictedName, i);

if (i > predictedName.getEnd()) {
Expand Down Expand Up @@ -310,7 +311,7 @@ private List<NameSample> extractNameSamples(Iterable<CAS> aCasses)
{
var nameSamples = new ArrayList<NameSample>();

nextCas: for (CAS cas : aCasses) {
nextCas: for (var cas : aCasses) {
var sampleUnitType = getType(cas, SAMPLE_UNIT);
var tokenType = getType(cas, Token.class);

Expand Down Expand Up @@ -344,6 +345,10 @@ private List<NameSample> extractNameSamples(Iterable<CAS> aCasses)
private Span[] extractAnnotatedSpans(CAS aCas, AnnotationFS aSampleUnit,
Collection<? extends AnnotationFS> aTokens)
{
if (aTokens.isEmpty()) {
return new Span[0];
}

// Create spans from target annotations
var annotationType = getType(aCas, layerName);
var feature = annotationType.getFeatureByBaseName(featureName);
Expand All @@ -358,19 +363,22 @@ private Span[] extractAnnotatedSpans(CAS aCas, AnnotationFS aSampleUnit,
var idxTokenEndOffset = new Int2ObjectOpenHashMap<AnnotationFS>();
var idxToken = new Object2IntOpenHashMap<AnnotationFS>();
var idx = 0;
for (AnnotationFS t : aTokens) {
idxTokenBeginOffset.put(t.getBegin(), t);
idxTokenEndOffset.put(t.getEnd(), t);
idxToken.put(t, idx);
for (var token : aTokens) {
idxTokenBeginOffset.put(token.getBegin(), token);
idxTokenEndOffset.put(token.getEnd(), token);
idxToken.put(token, idx);
idx++;
}

var result = new ArrayList<Span>();
var highestEndTokenPositionObserved = 0;
var highestEndTokenPositionObserved = -1;
var numberOfAnnotations = annotations.size();
for (int i = 0; i < numberOfAnnotations; i++) {
var annotation = annotations.get(i);
var label = annotation.getFeatureValueAsString(feature);
if (isBlank(label)) {
label = BLANK_LABEL;
}

var beginToken = idxTokenBeginOffset.get(annotation.getBegin());
var endToken = idxTokenEndOffset.get(annotation.getEnd());
Expand All @@ -391,10 +399,8 @@ private Span[] extractAnnotatedSpans(CAS aCas, AnnotationFS aSampleUnit,
continue;
}

if (isNotBlank(label)) {
result.add(new Span(begin, end + 1, label));
highestEndTokenPositionObserved = end + 1;
}
result.add(new Span(begin, end + 1, label));
highestEndTokenPositionObserved = end + 1;
}

return result.toArray(new Span[result.size()]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@
import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationLayer;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.inception.annotation.storage.CasStorageSession;
import de.tudarmstadt.ukp.inception.recommendation.api.evaluation.DataSplitter;
import de.tudarmstadt.ukp.inception.recommendation.api.evaluation.EvaluationResult;
import de.tudarmstadt.ukp.inception.recommendation.api.evaluation.IncrementalSplitter;
import de.tudarmstadt.ukp.inception.recommendation.api.evaluation.PercentageBasedSplitter;
import de.tudarmstadt.ukp.inception.recommendation.api.model.Recommender;
Expand Down Expand Up @@ -114,16 +112,16 @@ public void thatPredictionWorks() throws Exception
@Test
public void thatEvaluationWorks() throws Exception
{
DataSplitter splitStrategy = new PercentageBasedSplitter(0.8, 10);
OpenNlpNerRecommender sut = new OpenNlpNerRecommender(recommender, traits);
List<CAS> casList = loadDevelopmentData();
var splitStrategy = new PercentageBasedSplitter(0.8, 10);
var sut = new OpenNlpNerRecommender(recommender, traits);
var casList = loadDevelopmentData();

EvaluationResult result = sut.evaluate(casList, splitStrategy);
var result = sut.evaluate(casList, splitStrategy);

double fscore = result.computeF1Score();
double accuracy = result.computeAccuracyScore();
double precision = result.computePrecisionScore();
double recall = result.computeRecallScore();
var fscore = result.computeF1Score();
var accuracy = result.computeAccuracyScore();
var precision = result.computePrecisionScore();
var recall = result.computeRecallScore();

System.out.printf("F1-Score: %f%n", fscore);
System.out.printf("Accuracy: %f%n", accuracy);
Expand All @@ -136,6 +134,34 @@ public void thatEvaluationWorks() throws Exception
assertThat(accuracy).isStrictlyBetween(0.0, 1.0);
}

@Test
public void thatEvaluationWorksNoLabels() throws Exception
{
var splitStrategy = new PercentageBasedSplitter(0.8, 10);
var sut = new OpenNlpNerRecommender(recommender, traits);
var casList = loadDevelopmentData();
for (var cas : casList) {
cas.select(NamedEntity.class).forEach(ne -> ne.setValue(null));
}

var result = sut.evaluate(casList, splitStrategy);

var fscore = result.computeF1Score();
var accuracy = result.computeAccuracyScore();
var precision = result.computePrecisionScore();
var recall = result.computeRecallScore();

System.out.printf("F1-Score: %f%n", fscore);
System.out.printf("Accuracy: %f%n", accuracy);
System.out.printf("Precision: %f%n", precision);
System.out.printf("Recall: %f%n", recall);

assertThat(fscore).isBetween(0.0, 1.0);
assertThat(precision).isBetween(0.0, 1.0);
assertThat(recall).isBetween(0.0, 1.0);
assertThat(accuracy).isBetween(0.0, 1.0);
}

@Test
public void thatIncrementalNerEvaluationWorks() throws Exception
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@
* limitations under the License.
*/
function bootstrapFeedbackPanelCloseAll() {
$('.alert').each((i, e) => {
$('.feedbackPanel .alert').each((i, e) => {
console.log("Closing", e);
bootstrap.Alert.getOrCreateInstance(e).close();
});
}

function bootstrapFeedbackPanelFade() {
setTimeout(function() {
$(".alert.alert-success").fadeTo(600, 0, function(){
$(".feedbackPanel .alert.alert-success").fadeTo(600, 0, function(){
$(".alert.alert-success").each((i, e) => bootstrap.Alert.getOrCreateInstance(e).close());
});
$(".alert.alert-info").fadeTo(600, 0, function(){
$(".feedbackPanel .alert.alert-info").fadeTo(600, 0, function(){
$(".alert.alert-info").each((i, e) => bootstrap.Alert.getOrCreateInstance(e).close());
});
}, 2000);
Expand Down

0 comments on commit 07517e4

Please sign in to comment.