From 87030d19532fb04ab9be08aab5d5fe317d619401 Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Wed, 20 Nov 2024 18:07:44 +0100 Subject: [PATCH 1/2] Implemented GreedyStringTiling workaround --- .../src/main/java/de/jplag/GreedyStringTiling.java | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/de/jplag/GreedyStringTiling.java b/core/src/main/java/de/jplag/GreedyStringTiling.java index 3cef93354..e08450584 100644 --- a/core/src/main/java/de/jplag/GreedyStringTiling.java +++ b/core/src/main/java/de/jplag/GreedyStringTiling.java @@ -30,6 +30,8 @@ public class GreedyStringTiling { private final Map cachedTokenValueLists = new IdentityHashMap<>(); private final Map cachedHashLookupTables = new IdentityHashMap<>(); + private static final String ERROR_INDEX_OUT_OF_BOUNDS = "GST index out of bounds. This is probably a random issue caused by multithreading issues. Length: %s, Index: %s"; + public GreedyStringTiling(JPlagOptions options) { this.options = options; // Ensures 1 <= neighborLength <= minimumTokenMatch @@ -115,14 +117,14 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri List iterationMatches = new ArrayList<>(); for (int leftStartIndex = 0; leftStartIndex < leftValues.length - maximumMatchLength; leftStartIndex++) { int leftSubsequenceHash = leftLookupTable.subsequenceHashForStartIndex(leftStartIndex); - if (leftMarked[leftStartIndex] || leftSubsequenceHash == SubsequenceHashLookupTable.NO_HASH) { + if (checkMark(leftMarked, leftStartIndex) || leftSubsequenceHash == SubsequenceHashLookupTable.NO_HASH) { continue; } List possiblyMatchingRightStartIndexes = rightLookupTable .startIndexesOfPossiblyMatchingSubsequencesForSubsequenceHash(leftSubsequenceHash); for (Integer rightStartIndex : possiblyMatchingRightStartIndexes) { // comparison uses >= because it is assumed that the last token is a pivot (FILE_END) - if (rightMarked[rightStartIndex] || maximumMatchLength >= rightValues.length - rightStartIndex) { + if (checkMark(rightMarked, rightStartIndex) || maximumMatchLength >= rightValues.length - rightStartIndex) { continue; } @@ -228,4 +230,12 @@ private int[] tokenValueListFromSubmission(Submission submission) { return tokenValueList; })); } + + private boolean checkMark(boolean[] marks, int index) { + if (index >= marks.length) { + throw new IllegalStateException(String.format(ERROR_INDEX_OUT_OF_BOUNDS, marks.length, index)); + } + + return marks[index]; + } } From 217f8b83b8b5d2b4ad5b2a207532fd7e2e5db33e Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Thu, 28 Nov 2024 11:40:15 +0100 Subject: [PATCH 2/2] Expanded error message for GreedyStringTiling out of bounds --- .../main/java/de/jplag/GreedyStringTiling.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/de/jplag/GreedyStringTiling.java b/core/src/main/java/de/jplag/GreedyStringTiling.java index e08450584..9bbe28833 100644 --- a/core/src/main/java/de/jplag/GreedyStringTiling.java +++ b/core/src/main/java/de/jplag/GreedyStringTiling.java @@ -9,6 +9,7 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import java.util.stream.Collectors; import de.jplag.options.JPlagOptions; @@ -30,7 +31,12 @@ public class GreedyStringTiling { private final Map cachedTokenValueLists = new IdentityHashMap<>(); private final Map cachedHashLookupTables = new IdentityHashMap<>(); - private static final String ERROR_INDEX_OUT_OF_BOUNDS = "GST index out of bounds. This is probably a random issue caused by multithreading issues. Length: %s, Index: %s"; + private static final String ERROR_INDEX_OUT_OF_BOUNDS = """ + GST index out of bounds. This is probably a random issue caused by multithreading issues. + Length: %s, Index: %s + TokenCount: %s, TokenList: %s + CachedTokenCount: %s + """.trim().stripIndent(); public GreedyStringTiling(JPlagOptions options) { this.options = options; @@ -117,14 +123,14 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri List iterationMatches = new ArrayList<>(); for (int leftStartIndex = 0; leftStartIndex < leftValues.length - maximumMatchLength; leftStartIndex++) { int leftSubsequenceHash = leftLookupTable.subsequenceHashForStartIndex(leftStartIndex); - if (checkMark(leftMarked, leftStartIndex) || leftSubsequenceHash == SubsequenceHashLookupTable.NO_HASH) { + if (checkMark(leftMarked, leftStartIndex, leftSubmission) || leftSubsequenceHash == SubsequenceHashLookupTable.NO_HASH) { continue; } List possiblyMatchingRightStartIndexes = rightLookupTable .startIndexesOfPossiblyMatchingSubsequencesForSubsequenceHash(leftSubsequenceHash); for (Integer rightStartIndex : possiblyMatchingRightStartIndexes) { // comparison uses >= because it is assumed that the last token is a pivot (FILE_END) - if (checkMark(rightMarked, rightStartIndex) || maximumMatchLength >= rightValues.length - rightStartIndex) { + if (checkMark(rightMarked, rightStartIndex, rightSubmission) || maximumMatchLength >= rightValues.length - rightStartIndex) { continue; } @@ -231,9 +237,11 @@ private int[] tokenValueListFromSubmission(Submission submission) { })); } - private boolean checkMark(boolean[] marks, int index) { + private boolean checkMark(boolean[] marks, int index, Submission submission) { if (index >= marks.length) { - throw new IllegalStateException(String.format(ERROR_INDEX_OUT_OF_BOUNDS, marks.length, index)); + throw new IllegalStateException(String.format(ERROR_INDEX_OUT_OF_BOUNDS, marks.length, index, submission.getTokenList().size(), + submission.getTokenList().stream().map(it -> it.getType().getDescription()).collect(Collectors.joining(", ")), + cachedTokenValueLists.get(submission).length)); } return marks[index];