Skip to content

Commit

Permalink
Fix passage building across more than two concatenated files (fixes #421)
Browse files Browse the repository at this point in the history

The code previously only considered the case of two concatenated files
when reading from a concatenated source pointer. This has now been fixed
to allow an arbitrary number of files between the start and end offsets.
  • Loading branch information
jbaiter committed Apr 24, 2024
1 parent 54147b9 commit d1d8485
Show file tree
Hide file tree
Showing 10 changed files with 4,868 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ private IterableCharSequence getCharSeq(int offset) {
return it;
}

private int adjustOffset(int offset) {
/** Given an absolute offset, return the relative offset in the file the offset resides in. */
private int getFileRelativeOffset(int offset) {
return offset - offsetMap.floorKey(offset);
}

Expand Down Expand Up @@ -98,7 +99,7 @@ public char charAt(int offset) {
throw new IndexOutOfBoundsException();
}
IterableCharSequence seq = getCharSeq(offset);
int adjustedOffset = adjustOffset(offset);
int adjustedOffset = getFileRelativeOffset(offset);
return seq.charAt(adjustedOffset);
}

Expand All @@ -110,15 +111,19 @@ public CharSequence subSequence(int start, int end, boolean forceAscii) {
if (offsetMap.floorKey(start).equals(offsetMap.floorKey(end))) {
// Easy mode, start and end are in the same file
IterableCharSequence seq = getCharSeq(start);
return seq.subSequence(adjustOffset(start), adjustOffset(end), forceAscii);
return seq.subSequence(getFileRelativeOffset(start), getFileRelativeOffset(end), forceAscii);
} else {
IterableCharSequence seq = getCharSeq(start);
int adjustedStart = adjustOffset(start);
StringBuilder sb =
new StringBuilder(seq.subSequence(adjustedStart, seq.length(), forceAscii));
seq = getCharSeq(end);
int adjustedEnd = adjustOffset(end);
sb.append(seq.subSequence(0, adjustedEnd, forceAscii));
// Hard mode, start and end span multiple files
final int targetLength = end - start;
StringBuilder sb = new StringBuilder(targetLength);
for (int charsRead = 0; charsRead < targetLength; ) {
IterableCharSequence seq = getCharSeq(start + charsRead);
int fileOffsetStart = getFileRelativeOffset(start + charsRead);
int fileOffsetEnd = Math.min(seq.length(), fileOffsetStart + (targetLength - charsRead));

sb.append(seq.subSequence(fileOffsetStart, fileOffsetEnd, forceAscii));
charsRead += fileOffsetEnd - fileOffsetStart;
}
return sb.toString();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,7 @@ protected String getHighlightedFragment(Passage passage, IterableCharSequence co
mergeMatches(passage.getNumMatches(), passage.getMatchStarts(), passage.getMatchEnds());
for (PassageMatch match : matches) {
// Can't just do match.start - passage.getStartOffset(), since both offsets are relative to
// **UTF-8 bytes**, but
// we need **UTF-16 codepoint** offsets in the code.
// **UTF-8 bytes**, but we need **UTF-16 codepoint** offsets in the code.
String preMatchContent =
content.subSequence(passage.getStartOffset(), match.start).toString();
int matchStart = preMatchContent.length();
Expand Down
19 changes: 19 additions & 0 deletions src/test/java/com/github/dbmdz/solrocr/solr/AltoMultiTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ public static void beforeClass() throws Exception {
+ "[2223:25803,25808:32247,32252:38770,38775:85408,85413:88087,88092:120911,120916:149458,"
+ "149463:178686,178691:220893,220898:231618,231623:242459]";
assertU(adoc("ocr_text", debugPtr, "id", "84"));

ocrBasePath = Paths.get("src/test/resources/data/alto_concat_421");
ptr =
Files.list(ocrBasePath)
.filter(p -> p.getFileName().toString().endsWith(".xml"))
.map(p -> p.toAbsolutePath().toString())
.sorted()
.collect(Collectors.joining("+"));
assertU(adoc("ocr_text", ptr, "id", "126"));
assertU(commit());
}

Expand Down Expand Up @@ -127,4 +136,14 @@ public void testRegionsWithHyphenation() {
"contains(//arr[@name='snippets']/lst/str[@name='text']/text(), '<em>kalifat</em>')",
"count(//arr[@name='highlights']/arr/lst)=2");
}

@Test
public void testLargeConcatenatedOcr() {
SolrQueryRequest req = xmlQ("q", "china", "fq", "id:126");
assertQ(
req,
"count(//arr[@name='snippets']/lst)=7",
"count(//arr[@name='highlights']/arr/lst)=7",
"//arr[@name='snippets']/lst[1]/str[@name='text']/text()='die Annahme, die Sogder seien aus Wohnsitzen in Ostturkistan ins obere Industal gezogen, um dort wichtige Positionen im Handel zwischen <em>China</em> und Indien einnehmen zu können. Dieser 45'");
}
}
698 changes: 698 additions & 0 deletions src/test/resources/data/alto_concat_421/007.xml

Large diffs are not rendered by default.

779 changes: 779 additions & 0 deletions src/test/resources/data/alto_concat_421/009.xml

Large diffs are not rendered by default.

662 changes: 662 additions & 0 deletions src/test/resources/data/alto_concat_421/017.xml

Large diffs are not rendered by default.

638 changes: 638 additions & 0 deletions src/test/resources/data/alto_concat_421/019.xml

Large diffs are not rendered by default.

695 changes: 695 additions & 0 deletions src/test/resources/data/alto_concat_421/020.xml

Large diffs are not rendered by default.

523 changes: 523 additions & 0 deletions src/test/resources/data/alto_concat_421/046.xml

Large diffs are not rendered by default.

838 changes: 838 additions & 0 deletions src/test/resources/data/alto_concat_421/052.xml

Large diffs are not rendered by default.

0 comments on commit d1d8485

Please sign in to comment.