-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
LUCENE-10626: Hunspell: add tools to aid dictionary editing: analysis…
… introspection, stem expansion and stem/flag suggestion (#975)
- Loading branch information
1 parent
3dd9a54
commit d537013
Showing
19 changed files
with
1,351 additions
and
170 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
119 changes: 119 additions & 0 deletions
119
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixedWord.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.hunspell; | ||
|
||
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG; | ||
|
||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Objects; | ||
|
||
/** An object representing the analysis result of a simple (non-compound) word */ | ||
public final class AffixedWord { | ||
private final String word; | ||
private final DictEntry entry; | ||
private final List<Affix> prefixes; | ||
private final List<Affix> suffixes; | ||
|
||
AffixedWord(String word, DictEntry entry, List<Affix> prefixes, List<Affix> suffixes) { | ||
this.word = word; | ||
this.entry = entry; | ||
this.prefixes = Collections.unmodifiableList(prefixes); | ||
this.suffixes = Collections.unmodifiableList(suffixes); | ||
} | ||
|
||
/** @return the word being analyzed */ | ||
public String getWord() { | ||
return word; | ||
} | ||
|
||
/** @return the dictionary entry for the stem in this analysis */ | ||
public DictEntry getDictEntry() { | ||
return entry; | ||
} | ||
|
||
/** @return the list of prefixes applied to the stem, at most two, outermost first */ | ||
public List<Affix> getPrefixes() { | ||
return prefixes; | ||
} | ||
|
||
/** @return the list of suffixes applied to the stem, at most two, outermost first */ | ||
public List<Affix> getSuffixes() { | ||
return suffixes; | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
if (this == o) return true; | ||
if (!(o instanceof AffixedWord that)) return false; | ||
return word.equals(that.word) | ||
&& entry.equals(that.entry) | ||
&& prefixes.equals(that.prefixes) | ||
&& suffixes.equals(that.suffixes); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash(word, entry, prefixes, suffixes); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "AffixedWord[" | ||
+ ("word=" + word + ", ") | ||
+ ("entry=" + entry + ", ") | ||
+ ("prefixes=" + prefixes + ", ") | ||
+ ("suffixes=" + suffixes) | ||
+ "]"; | ||
} | ||
|
||
/** An object representing a prefix or a suffix applied to a word stem */ | ||
public static final class Affix { | ||
final int affixId; | ||
private final String presentableFlag; | ||
|
||
Affix(Dictionary dictionary, int affixId) { | ||
this.affixId = affixId; | ||
char encodedFlag = dictionary.affixData(affixId, AFFIX_FLAG); | ||
presentableFlag = dictionary.flagParsingStrategy.printFlag(encodedFlag); | ||
} | ||
|
||
/** | ||
* @return the corresponding affix flag as it appears in the *.aff file. Depending on the | ||
* format, it could be a Unicode character, two ASCII characters, or an integer in decimal | ||
* form | ||
*/ | ||
public String getFlag() { | ||
return presentableFlag; | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
return this == o || o instanceof Affix a && affixId == a.affixId; | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return affixId; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return presentableFlag + "(id=" + affixId + ")"; | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
109 changes: 109 additions & 0 deletions
109
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntry.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.hunspell; | ||
|
||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Objects; | ||
|
||
/**
 * An object representing a *.dic file entry with its word, flags and morphological data.
 *
 * <p>Equality, hashing and the string form are all derived from the stem, {@link #getFlags()} and
 * {@link #getMorphologicalData()}.
 */
public abstract class DictEntry {
  private final String stem;

  /** @param stem the stem word of this entry */
  DictEntry(String stem) {
    this.stem = stem;
  }

  /**
   * @return the stem, followed by {@code /flags} when the flags are non-empty, followed by a space
   *     and the morphological data when that is non-empty
   */
  @Override
  public String toString() {
    String result = stem;
    String flags = getFlags();
    if (!flags.isEmpty()) {
      result += "/" + flags;
    }
    String morph = getMorphologicalData();
    if (!morph.isEmpty()) {
      result += " " + morph;
    }
    return result;
  }

  /** Entries are equal when their stem, morphological data and flags are all equal. */
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof DictEntry that)) return false;
    return stem.equals(that.stem)
        && getMorphologicalData().equals(that.getMorphologicalData())
        && getFlags().equals(that.getFlags());
  }

  @Override
  public int hashCode() {
    return Objects.hash(stem, getFlags(), getMorphologicalData());
  }

  /** @return the stem word in the dictionary */
  public String getStem() {
    return stem;
  }

  /**
   * @return the flags associated with the dictionary entry, encoded in the same format as in the
   *     *.dic file, but possibly in a different order
   */
  public abstract String getFlags();

  /**
   * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
   *     {@code ph:}) associated with this dictionary entry, or an empty string
   */
  public abstract String getMorphologicalData();

  /**
   * @param key the key in the form {@code kk:} by which to filter the morphological fields
   * @return the values (of {@code vvvvvv} form) of the morphological fields of this entry that
   *     start with the given key, in the order they occur in {@link #getMorphologicalData()}; an
   *     empty list if there are none
   */
  public List<String> getMorphologicalValues(String key) {
    // Only checked when assertions are enabled: the key must be three characters ending in ':'.
    assert key.length() == 3 && key.charAt(2) == ':'
        : "A morphological data key should consist of two letters followed by a colon, found: "
            + key;

    String data = getMorphologicalData();
    // Cheap pre-check that avoids splitting when the key cannot possibly match.
    if (data.isEmpty() || !data.contains(key)) return Collections.emptyList();

    return Arrays.stream(data.split(" "))
        .filter(s -> s.startsWith(key))
        .map(s -> s.substring(3)) // drop the three-character "kk:" prefix
        .toList();
  }

  /**
   * Creates an entry that has the given stem and flags and no morphological data.
   *
   * @param stem the stem word
   * @param flags the flags to report from {@link #getFlags()}
   */
  static DictEntry create(String stem, String flags) {
    return new DictEntry(stem) {
      @Override
      public String getFlags() {
        return flags;
      }

      @Override
      public String getMorphologicalData() {
        return "";
      }
    };
  }
}
Oops, something went wrong.