Skip to content

Commit

Permalink
#4772 - Repair to trim whitespace off annotations
Browse files Browse the repository at this point in the history
- Added repair
- No longer force-enable server-side timings in dev mode
- Show progress of checks and repairs in task tray
  • Loading branch information
reckart committed Apr 29, 2024
1 parent 9db2487 commit d56df2f
Show file tree
Hide file tree
Showing 14 changed files with 862 additions and 232 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ private static void init(SpringApplicationBuilder aBuilder)
if (Boolean.getBoolean("inception.dev")) {
System.setProperty("wicket.core.settings.debug.enabled", "true");
System.setProperty("wicket.core.settings.general.configuration-type", "development");
System.setProperty("debug.sendServerSideTimings", "true");
System.setProperty("webanno.debug.enforce_cas_thread_lock", "true");
aBuilder.profiles(DeploymentModeService.PROFILE_DEVELOPMENT_MODE);
}
Expand Down
4 changes: 4 additions & 0 deletions inception/inception-diag/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
</dependency>

<dependency>
<groupId>org.apache.uima</groupId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.clarin.webanno.diag.checks;

import static java.lang.String.join;
import static org.apache.commons.lang3.StringUtils.abbreviateMiddle;
import static org.apache.commons.text.StringEscapeUtils.escapeJava;
import static org.apache.uima.fit.util.CasUtil.getType;
import static org.apache.uima.fit.util.CasUtil.select;
import static org.springframework.util.CollectionUtils.isEmpty;

import java.util.ArrayList;
import java.util.List;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;

import de.tudarmstadt.ukp.clarin.webanno.model.Project;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils;
import de.tudarmstadt.ukp.inception.schema.api.AnnotationSchemaService;
import de.tudarmstadt.ukp.inception.support.logging.LogMessage;

/**
 * Diagnostic check which reports every annotation whose span starts and/or ends with characters
 * that {@link TrimUtils} would trim away. Only layers known to the project's annotation schema
 * and whose UIMA type is an annotation type are inspected. The CAS is not modified.
 */
public class AllAnnotationsStartAndEndWithCharactersCheck
    implements Check
{
    private final AnnotationSchemaService annotationService;

    /**
     * @param aAnnotationService
     *            service used to list the annotation layers of the project being checked. If it
     *            is {@code null}, {@link #check} passes without inspecting anything.
     */
    public AllAnnotationsStartAndEndWithCharactersCheck(AnnotationSchemaService aAnnotationService)
    {
        annotationService = aAnnotationService;
    }

    /**
     * Inspects all annotations on the project's layers and records an error message for each one
     * whose begin or end offset would be moved by trimming.
     *
     * @param aProject
     *            the project whose layers are inspected.
     * @param aCas
     *            the CAS containing the annotations.
     * @param aMessages
     *            receives one error message per offending annotation.
     * @return {@code true} if no offending annotation was found, {@code false} otherwise.
     */
    @Override
    public boolean check(Project aProject, CAS aCas, List<LogMessage> aMessages)
    {
        if (annotationService == null) {
            return true;
        }

        var layers = annotationService.listAnnotationLayer(aProject);
        if (isEmpty(layers)) {
            return true;
        }

        var clean = true;
        for (var layer : layers) {
            var type = findAnnotationType(aCas, layer.getName());
            if (type == null) {
                continue;
            }

            var docText = aCas.getDocumentText();
            for (var ann : select(aCas, type)) {
                int begin = ann.getBegin();
                int end = ann.getEnd();

                // Trim a copy of the offsets so the annotation itself stays untouched
                var trimmed = new int[] { begin, end };
                TrimUtils.trim(docText, trimmed);

                boolean leadingWhitespace = trimmed[0] != begin;
                boolean trailingWhitespace = trimmed[1] != end;
                if (leadingWhitespace || trailingWhitespace) {
                    var locations = new ArrayList<String>();
                    if (leadingWhitespace) {
                        locations.add("starts");
                    }
                    if (trailingWhitespace) {
                        locations.add("ends");
                    }

                    var coveredText = escapeJava(abbreviateMiddle(ann.getCoveredText(), "…", 20));
                    aMessages.add(LogMessage.error(this, "[%s] [%s]@[%d-%d] %s with whitespace",
                            ann.getType().getName(), coveredText, begin, end,
                            join(" and ", locations)));

                    clean = false;
                }
            }
        }

        return clean;
    }

    /**
     * Looks up the UIMA type with the given name.
     *
     * @return the type, or {@code null} if the type does not exist in the CAS or is not a subtype
     *         of the annotation type.
     */
    private static Type findAnnotationType(CAS aCas, String aTypeName)
    {
        Type type;
        try {
            type = getType(aCas, aTypeName);
        }
        catch (IllegalArgumentException e) {
            // If the type does not exist, the CAS has not been upgraded. In this case, we
            // can skip checking the layer because there will be no annotations anyway.
            return null;
        }

        // Non-annotation types are not inspected by this check
        var isAnnotationType = aCas.getTypeSystem().subsumes(aCas.getAnnotationType(), type);
        return isAnnotationType ? type : null;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import de.tudarmstadt.ukp.clarin.webanno.diag.ChecksRegistryImpl;
import de.tudarmstadt.ukp.clarin.webanno.diag.RepairsRegistry;
import de.tudarmstadt.ukp.clarin.webanno.diag.RepairsRegistryImpl;
import de.tudarmstadt.ukp.clarin.webanno.diag.checks.AllAnnotationsStartAndEndWithCharactersCheck;
import de.tudarmstadt.ukp.clarin.webanno.diag.checks.AllAnnotationsStartAndEndWithinSentencesCheck;
import de.tudarmstadt.ukp.clarin.webanno.diag.checks.AllFeatureStructuresIndexedCheck;
import de.tudarmstadt.ukp.clarin.webanno.diag.checks.CASMetadataTypeIsPresentCheck;
Expand All @@ -55,6 +56,7 @@
import de.tudarmstadt.ukp.clarin.webanno.diag.repairs.RemoveZeroSizeTokensAndSentencesRepair;
import de.tudarmstadt.ukp.clarin.webanno.diag.repairs.Repair;
import de.tudarmstadt.ukp.clarin.webanno.diag.repairs.SwitchBeginAndEndOnNegativeSizedAnnotationsRepair;
import de.tudarmstadt.ukp.clarin.webanno.diag.repairs.TrimAnnotationsRepair;
import de.tudarmstadt.ukp.clarin.webanno.diag.repairs.UpgradeCasRepair;
import de.tudarmstadt.ukp.inception.schema.api.AnnotationSchemaService;

Expand Down Expand Up @@ -236,4 +238,17 @@ public UnreachableAnnotationsCheck unreachableAnnotationsCheck()
{
return new UnreachableAnnotationsCheck();
}

/**
 * Exposes the {@link AllAnnotationsStartAndEndWithCharactersCheck} as a bean.
 *
 * @param aAnnotationService
 *            the schema service handed to the check's constructor.
 * @return a new check instance.
 */
@Bean
public AllAnnotationsStartAndEndWithCharactersCheck allAnnotationsStartAndEndWithCharactersCheck(
        AnnotationSchemaService aAnnotationService)
{
    var check = new AllAnnotationsStartAndEndWithCharactersCheck(aAnnotationService);
    return check;
}

/**
 * Exposes the {@link TrimAnnotationsRepair} as a bean.
 *
 * @param aAnnotationService
 *            the schema service handed to the repair's constructor.
 * @return a new repair instance.
 */
@Bean
public TrimAnnotationsRepair trimAnnotationsRepair(AnnotationSchemaService aAnnotationService)
{
    var repair = new TrimAnnotationsRepair(aAnnotationService);
    return repair;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.clarin.webanno.diag.repairs;

import static java.lang.String.join;
import static org.apache.commons.lang3.StringUtils.abbreviateMiddle;
import static org.apache.commons.text.StringEscapeUtils.escapeJava;
import static org.apache.uima.fit.util.CasUtil.getType;
import static org.apache.uima.fit.util.CasUtil.select;
import static org.springframework.util.CollectionUtils.isEmpty;

import java.util.ArrayList;
import java.util.List;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.jcas.tcas.Annotation;

import de.tudarmstadt.ukp.clarin.webanno.model.Project;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils;
import de.tudarmstadt.ukp.inception.schema.api.AnnotationSchemaService;
import de.tudarmstadt.ukp.inception.support.logging.LogMessage;

/**
 * Repair which adjusts the begin/end offsets of annotations via {@link TrimUtils} so that they no
 * longer include whitespace at the beginning or end. Only layers known to the project's
 * annotation schema and whose UIMA type is an annotation type are processed. Every annotation
 * whose offsets were changed is reported through an info message.
 */
public class TrimAnnotationsRepair
    implements Repair
{
    private final AnnotationSchemaService annotationService;

    /**
     * @param aAnnotationService
     *            service used to list the annotation layers of the project being repaired. If it
     *            is {@code null}, {@link #repair} does nothing.
     */
    public TrimAnnotationsRepair(AnnotationSchemaService aAnnotationService)
    {
        annotationService = aAnnotationService;
    }

    /**
     * Trims all annotations on the project's layers in the given CAS.
     *
     * @param aProject
     *            the project whose layers are processed.
     * @param aCas
     *            the CAS containing the annotations; annotation offsets are modified in place.
     * @param aMessages
     *            receives one info message per annotation that was changed. The offsets and text
     *            in the message are those of the annotation after trimming.
     */
    @Override
    public void repair(Project aProject, CAS aCas, List<LogMessage> aMessages)
    {
        // Same guard as in AllAnnotationsStartAndEndWithCharactersCheck: without a schema service
        // the layers cannot be listed, so there is nothing to do.
        if (annotationService == null) {
            return;
        }

        var allAnnoLayers = annotationService.listAnnotationLayer(aProject);
        if (isEmpty(allAnnoLayers)) {
            return;
        }

        // The document text does not depend on the layer, so fetch it only once.
        var docText = aCas.getDocumentText();
        if (docText == null) {
            // Without document text there are no characters that could be trimmed
            return;
        }

        for (var layer : allAnnoLayers) {
            Type type;
            try {
                type = getType(aCas, layer.getName());
            }
            catch (IllegalArgumentException e) {
                // If the type does not exist, the CAS has not been upgraded. In this case, we
                // can skip the layer because there will be no annotations anyway.
                continue;
            }

            if (!aCas.getTypeSystem().subsumes(aCas.getAnnotationType(), type)) {
                // Skip non-annotation types
                continue;
            }

            for (var ann : select(aCas, type)) {
                // Remember the original offsets so we can tell afterwards what was changed
                var oldBegin = ann.getBegin();
                var oldEnd = ann.getEnd();

                TrimUtils.trim(docText, (Annotation) ann);

                boolean beginChanged = oldBegin != ann.getBegin();
                boolean endChanged = oldEnd != ann.getEnd();
                if (!beginChanged && !endChanged) {
                    continue;
                }

                var locations = new ArrayList<String>();
                if (beginChanged) {
                    locations.add("start");
                }
                if (endChanged) {
                    locations.add("end");
                }

                aMessages.add(LogMessage.info(this, "Trimmed whitespace of [%s] [%s]@[%d-%d] at %s",
                        ann.getType().getName(),
                        escapeJava(abbreviateMiddle(ann.getCoveredText(), "…", 20)),
                        ann.getBegin(), ann.getEnd(), join(" and ", locations)));
            }
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,16 @@ Checks if there are any unreachable feature structures. Such feature structures
they are not regularly accessible. Such feature structures may be created as a result of bugs.
Removing them is harmless and reduces memory and disk space usage.

[[check_AllAnnotationsStartAndEndWithCharactersCheck]]
=== All annotations start and end with characters
[horizontal]
ID:: `check_AllAnnotationsStartAndEndWithCharactersCheck`
Related repairs:: <<repair_TrimAnnotationsRepair>>

Checks if all annotations start and end with a character (i.e. not with whitespace). Annotations that start or end with a
whitespace character can cause problems during rendering. Trimming whitespace at the beginning and end is typically a
harmless procedure.


[[sect_repairs]]
== Repairs
Expand Down Expand Up @@ -369,3 +379,12 @@ ID:: `CoverAllTextInSentencesRepair`
This repair checks if there is any text not covered by sentences. If there is, it creates a new
sentence annotation on this text, starting at the end of the last sentence before it (or the start
of the document text) and ending at the beginning of the next sentence (or the end of the document text).

[[repair_TrimAnnotationsRepair]]
=== Trim annotations

[horizontal]
ID:: `TrimAnnotationsRepair`

This repair adjusts annotation boundaries such that they do not include any whitespace at the beginning or end of the
annotation.
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ public void runSync()
}
}

public abstract void execute();
public abstract void execute() throws Exception;

@Override
public String toString()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,7 @@ protected void init()
private void installTimingListener()
{
var settings = SettingsUtil.getSettings();
if (!DEVELOPMENT.equals(getConfigurationType())
&& !"true".equalsIgnoreCase(settings.getProperty("debug.sendServerSideTimings"))) {
if (!"true".equalsIgnoreCase(settings.getProperty("debug.sendServerSideTimings"))) {
return;
}

Expand Down
4 changes: 4 additions & 0 deletions inception/inception-ui-project/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@
<groupId>de.tudarmstadt.ukp.inception.app</groupId>
<artifactId>inception-support-bootstrap</artifactId>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.inception.app</groupId>
<artifactId>inception-scheduling</artifactId>
</dependency>

<!-- UIMA dependencies -->

Expand Down
Loading

0 comments on commit d56df2f

Please sign in to comment.