diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java index ee63b539cc..0bd8af981e 100755 --- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java +++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java @@ -1265,6 +1265,13 @@ public static List characterPositionsUrlPatternWithPdfAnnotation int startTokenIndex = tokenPositions.start; int endTokensIndex = tokenPositions.end; + // No token matches the character offsets. This may happen, rarely, when + // a character offset falls in the middle of a token, which is likely due to a badly + // constructed PDF document. + if (startTokenIndex < 0 || endTokensIndex < 0) { + continue; + } + List urlTokens = new ArrayList<>(layoutTokens.subList(startTokenIndex, endTokensIndex+1)); String urlString = LayoutTokensUtil.toText(urlTokens); @@ -1360,6 +1367,14 @@ public static List characterPositionsUrlPatternWithPdfAnnotation String difference = urlString.substring(startCharDifference); OffsetPosition newTokenPositions = getTokenPositions(startCharDifference, urlString.length(), urlTokens); + if (newTokenPositions.end < 0) { + // The difference is within the last token; even if we split the layout tokens here, it would not solve the problem, so we limit collateral damage. 
+ // At some point we could return the destination containing the clean URL to fill in the + // "target" attribute in the TEI. NOTE(review): subList's end index is exclusive, so the whole last token is dropped below; confirm this is intended. + newTokenPositions.end = urlTokens.size() - 1; + } + urlTokens = urlTokens.subList(0, newTokenPositions.end); endPos = startPos + LayoutTokensUtil.toText(urlTokens).length(); } else { diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index b957d12b90..11d11cfcda 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -435,5 +435,22 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC assertThat(input.substring(url0.start, url0.end), is("https://uhslc.soest.hawaii.edu/stations/?stn=057#levels")); } + @Test + public void testGetTokenPosition() throws Exception { + + // NOTE LF: Pins the current behaviour: getTokenPositions returns -1 for both start and end when the + // character offsets do not match the token positions. + // Here the URL is https://paperpile.com/c/QlNkzH/Hj7c+4D5e, but because `Lameness` (characters 40-48) is attached, the last token + // is `Hj7c+4D5eLameness`, which causes trouble. + + String input = "https://paperpile.com/c/QlNkzH/Hj7c+4D5eLameness"; + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + OffsetPosition tokenPositions = Lexicon.getTokenPositions(40, 48, tokens); + + assertThat(tokenPositions.start, is(-1)); + assertThat(tokenPositions.end, is(-1)); + + } }