Skip to content

Commit

Permalink
LUCENE-10626: Hunspell: add tools to aid dictionary editing: analysis…
Browse files Browse the repository at this point in the history
… introspection, stem expansion and stem/flag suggestion (#975)
  • Loading branch information
donnerpeter authored Jul 5, 2022
1 parent 3dd9a54 commit d537013
Show file tree
Hide file tree
Showing 19 changed files with 1,351 additions and 170 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ New Features

* LUCENE-10151 Enable timeout support in IndexSearcher. (Deepika Sharma)

* LUCENE-10626 Hunspell: add tools to aid dictionary editing:
analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov)

Improvements
---------------------

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;

import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;

import java.util.Collections;
import java.util.List;
import java.util.Objects;

/**
 * The result of analyzing a simple (non-compound) word: the analyzed word, the dictionary entry
 * of its stem, and the prefixes and suffixes applied to that stem.
 */
public final class AffixedWord {
  private final String word;
  private final DictEntry entry;
  private final List<Affix> prefixes;
  private final List<Affix> suffixes;

  /**
   * Stores the parts of an analysis. The affix lists are wrapped into unmodifiable views (they are
   * not copied).
   */
  AffixedWord(String word, DictEntry entry, List<Affix> prefixes, List<Affix> suffixes) {
    this.word = word;
    this.entry = entry;
    this.prefixes = Collections.unmodifiableList(prefixes);
    this.suffixes = Collections.unmodifiableList(suffixes);
  }

  /** @return the word being analyzed */
  public String getWord() {
    return this.word;
  }

  /** @return the dictionary entry for the stem in this analysis */
  public DictEntry getDictEntry() {
    return this.entry;
  }

  /** @return the list of prefixes applied to the stem, at most two, outermost first */
  public List<Affix> getPrefixes() {
    return this.prefixes;
  }

  /** @return the list of suffixes applied to the stem, at most two, outermost first */
  public List<Affix> getSuffixes() {
    return this.suffixes;
  }

  /** Two analyses are equal when their word, dictionary entry, prefixes and suffixes are equal. */
  @Override
  public boolean equals(Object o) {
    if (o == this) {
      return true;
    }
    if (o instanceof AffixedWord other) {
      return word.equals(other.word)
          && entry.equals(other.entry)
          && prefixes.equals(other.prefixes)
          && suffixes.equals(other.suffixes);
    }
    return false;
  }

  @Override
  public int hashCode() {
    return Objects.hash(word, entry, prefixes, suffixes);
  }

  @Override
  public String toString() {
    return "AffixedWord[word="
        + word
        + ", entry="
        + entry
        + ", prefixes="
        + prefixes
        + ", suffixes="
        + suffixes
        + "]";
  }

  /** An object representing a prefix or a suffix applied to a word stem */
  public static final class Affix {
    final int affixId;
    private final String presentableFlag;

    /**
     * Looks up the flag of the affix with the given id in the dictionary and remembers its
     * printable form as produced by the dictionary's flag parsing strategy.
     */
    Affix(Dictionary dictionary, int affixId) {
      this.affixId = affixId;
      char rawFlag = dictionary.affixData(affixId, AFFIX_FLAG);
      this.presentableFlag = dictionary.flagParsingStrategy.printFlag(rawFlag);
    }

    /**
     * @return the corresponding affix flag as it appears in the *.aff file. Depending on the
     *     format, it could be a Unicode character, two ASCII characters, or an integer in decimal
     *     form
     */
    public String getFlag() {
      return this.presentableFlag;
    }

    /** Affixes are compared by their affix id only. */
    @Override
    public boolean equals(Object o) {
      if (o == this) {
        return true;
      }
      return o instanceof Affix other && other.affixId == affixId;
    }

    @Override
    public int hashCode() {
      return affixId;
    }

    @Override
    public String toString() {
      return presentableFlag + "(id=" + affixId + ")";
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,26 +24,22 @@
*
* @see Dictionary#lookupEntries
*/
public interface DictEntries {
public interface DictEntries extends List<DictEntry> {
/**
* @return a positive number of dictionary entries with the same word. Most often it's 1 (unless
* there are homonyms). Entries are indexed from 0 to {@code size() - 1} and these indices can
* be passed into other methods of this class.
*/
@Override
int size();

/**
* @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
* @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
* {@code ph:}) associated with the homonym at the given entry index, or an empty string
*/
String getMorphologicalData(int entryIndex);
/** Same as {@code get(entryIndex).getMorphologicalData()} */
default String getMorphologicalData(int entryIndex) {
return get(entryIndex).getMorphologicalData();
}

/**
* @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
* @param key the key in the form {@code kk:} by which to filter the morphological fields
* @return the values (of {@code vvvvvv} form) of morphological fields with the given key
* associated with the homonym at the given entry index
*/
List<String> getMorphologicalValues(int entryIndex, String key);
/** Same as {@code get(entryIndex).getMorphologicalValues(key)} */
default List<String> getMorphologicalValues(int entryIndex, String key) {
return get(entryIndex).getMorphologicalValues(key);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

/**
 * An object representing a {@code *.dic} file entry with its word, flags and morphological data.
 *
 * <p>Two entries are equal when their stems, flags and morphological data are all equal.
 */
public abstract class DictEntry {
  /** Length of a morphological field key of the {@code kk:} form: two characters and a colon. */
  private static final int KEY_LENGTH = 3;

  private final String stem;

  DictEntry(String stem) {
    this.stem = stem;
  }

  /**
   * @return the entry rendered as {@code stem}, followed by {@code /flags} when the flags are
   *     non-empty, followed by a space and the morphological data when that is non-empty
   */
  @Override
  public String toString() {
    String result = stem;
    String flags = getFlags();
    if (!flags.isEmpty()) {
      result += "/" + flags;
    }
    String morph = getMorphologicalData();
    if (!morph.isEmpty()) {
      result += " " + morph;
    }
    return result;
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof DictEntry that)) return false;
    return stem.equals(that.stem)
        && getMorphologicalData().equals(that.getMorphologicalData())
        && getFlags().equals(that.getFlags());
  }

  @Override
  public int hashCode() {
    return Objects.hash(stem, getFlags(), getMorphologicalData());
  }

  /** @return the stem word in the dictionary */
  public String getStem() {
    return stem;
  }

  /**
   * @return the flags associated with the dictionary entry, encoded in the same format as in the
   *     *.dic file, but possibly in a different order
   */
  public abstract String getFlags();

  /**
   * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
   *     {@code ph:}) associated with this dictionary entry, or an empty string
   */
  public abstract String getMorphologicalData();

  /**
   * @param key the key in the form {@code kk:} by which to filter the morphological fields
   * @return the values (of {@code vvvvvv} form) of morphological fields with the given key
   *     associated with this dictionary entry, in the order they appear in {@link
   *     #getMorphologicalData()}; an empty list if there are none
   */
  public List<String> getMorphologicalValues(String key) {
    assert key.length() == KEY_LENGTH && key.charAt(KEY_LENGTH - 1) == ':'
        : "A morphological data key should consist of two letters followed by a colon, found: "
            + key;

    String data = getMorphologicalData();
    // Cheap pre-check: skip splitting when no field can possibly start with the key.
    if (data.isEmpty() || !data.contains(key)) return Collections.emptyList();

    return Arrays.stream(data.split(" "))
        .filter(s -> s.startsWith(key))
        .map(s -> s.substring(KEY_LENGTH))
        .toList();
  }

  /** Creates an entry with the given stem and flags and with empty morphological data. */
  static DictEntry create(String stem, String flags) {
    return new DictEntry(stem) {
      @Override
      public String getFlags() {
        return flags;
      }

      @Override
      public String getMorphologicalData() {
        return "";
      }
    };
  }
}
Loading

0 comments on commit d537013

Please sign in to comment.