Skip to content

Commit

Permalink
hunspell (minor): reduce allocations when reading the dictionary's morphological data (apache#12323)
Browse files Browse the repository at this point in the history

There can be many entries with morph data, so we'd better avoid compiling and matching regexes, and even avoid stream allocation.
  • Loading branch information
donnerpeter authored Jun 1, 2023
1 parent ac8c187 commit 4bf1b94
Showing 1 changed file with 17 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -992,7 +992,7 @@ private int mergeDictionaries(
// if we haven't seen any custom morphological data, try to parse one
if (!hasCustomMorphData) {
int morphStart = line.indexOf(MORPH_SEPARATOR);
if (morphStart >= 0 && morphStart < line.length()) {
if (morphStart >= 0) {
String data = line.substring(morphStart + 1);
hasCustomMorphData =
splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
Expand Down Expand Up @@ -1321,14 +1321,22 @@ private List<String> splitMorphData(String morphData) {
if (morphData.isBlank()) {
return Collections.emptyList();
}
return Arrays.stream(morphData.split("\\s+"))
.filter(
s ->
s.length() > 3
&& Character.isLetter(s.charAt(0))
&& Character.isLetter(s.charAt(1))
&& s.charAt(2) == ':')
.collect(Collectors.toList());

List<String> result = null;
int start = 0;
for (int i = 0; i <= morphData.length(); i++) {
if (i == morphData.length() || Character.isWhitespace(morphData.charAt(i))) {
if (i - start > 3
&& Character.isLetter(morphData.charAt(start))
&& Character.isLetter(morphData.charAt(start + 1))
&& morphData.charAt(start + 2) == ':') {
if (result == null) result = new ArrayList<>();
result.add(morphData.substring(start, i));
}
start = i + 1;
}
}
return result == null ? List.of() : result;
}

boolean hasFlag(IntsRef forms, char flag) {
Expand Down

0 comments on commit 4bf1b94

Please sign in to comment.