LUCENE-10626: Hunspell: add tools to aid dictionary editing: analysis introspection, stem expansion and stem/flag suggestion (#975)
apache · Jul 5, 2022 · d537013 · d537013
1 parent 3dd9a54
commit d537013
Show file tree

Hide file tree

Showing 19 changed files with 1,351 additions and 170 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -36,6 +36,9 @@ New Features
 
 * LUCENE-10151 Enable timeout support in IndexSearcher. (Deepika Sharma)
 
+* LUCENE-10626 Hunspell: add tools to aid dictionary editing:
+  analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov)
+
 Improvements
 ---------------------
 

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixedWord.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixedWord.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * The result of analyzing a simple (non-compound) word: the analyzed word itself, the dictionary
+ * entry of its stem, and the prefixes and suffixes that were applied to that stem.
+ */
+public final class AffixedWord {
+  private final String word;
+  private final DictEntry entry;
+  private final List<Affix> prefixes;
+  private final List<Affix> suffixes;
+
+  AffixedWord(String word, DictEntry entry, List<Affix> prefixes, List<Affix> suffixes) {
+    this.word = word;
+    this.entry = entry;
+    // Read-only views over the supplied lists; the lists themselves are not copied.
+    this.prefixes = Collections.unmodifiableList(prefixes);
+    this.suffixes = Collections.unmodifiableList(suffixes);
+  }
+
+  /** @return the word being analyzed */
+  public String getWord() {
+    return word;
+  }
+
+  /** @return the dictionary entry for the stem in this analysis */
+  public DictEntry getDictEntry() {
+    return entry;
+  }
+
+  /** @return the list of prefixes applied to the stem, at most two, outermost first */
+  public List<Affix> getPrefixes() {
+    return prefixes;
+  }
+
+  /** @return the list of suffixes applied to the stem, at most two, outermost first */
+  public List<Affix> getSuffixes() {
+    return suffixes;
+  }
+
+  /** Two analyses are equal when their word, stem entry, prefixes and suffixes are all equal. */
+  @Override
+  public boolean equals(Object o) {
+    if (o == this) {
+      return true;
+    }
+    if (o instanceof AffixedWord other) {
+      return word.equals(other.word)
+          && entry.equals(other.entry)
+          && prefixes.equals(other.prefixes)
+          && suffixes.equals(other.suffixes);
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    // Same 31-based combination, in the same field order, that Objects.hash(...) computes.
+    int hash = 1;
+    hash = 31 * hash + Objects.hashCode(word);
+    hash = 31 * hash + Objects.hashCode(entry);
+    hash = 31 * hash + Objects.hashCode(prefixes);
+    hash = 31 * hash + Objects.hashCode(suffixes);
+    return hash;
+  }
+
+  @Override
+  public String toString() {
+    return "AffixedWord[word="
+        + word
+        + ", entry="
+        + entry
+        + ", prefixes="
+        + prefixes
+        + ", suffixes="
+        + suffixes
+        + "]";
+  }
+
+  /** A single prefix or suffix that was applied to a word stem */
+  public static final class Affix {
+    final int affixId;
+    private final String presentableFlag;
+
+    Affix(Dictionary dictionary, int affixId) {
+      this.affixId = affixId;
+      // Look up the affix's encoded flag and render it once, in its printable form.
+      char flagCode = dictionary.affixData(affixId, AFFIX_FLAG);
+      this.presentableFlag = dictionary.flagParsingStrategy.printFlag(flagCode);
+    }
+
+    /**
+     * @return the corresponding affix flag as it appears in the *.aff file. Depending on the
+     *     format, it could be a Unicode character, two ASCII characters, or an integer in decimal
+     *     form
+     */
+    public String getFlag() {
+      return presentableFlag;
+    }
+
+    /** Affixes are compared by their affix id only. */
+    @Override
+    public boolean equals(Object o) {
+      if (o == this) {
+        return true;
+      }
+      return o instanceof Affix other && other.affixId == affixId;
+    }
+
+    @Override
+    public int hashCode() {
+      return affixId;
+    }
+
+    @Override
+    public String toString() {
+      String idSuffix = "(id=" + affixId + ")";
+      return presentableFlag + idSuffix;
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java
@@ -24,26 +24,22 @@
  *
  * @see Dictionary#lookupEntries
  */
-public interface DictEntries {
+public interface DictEntries extends List<DictEntry> {
   /**
    * @return a positive number of dictionary entries with the same word. Most often it's 1 (unless
    *     there are homonyms). Entries are indexed from 0 to {@code size() - 1} and these indices can
    *     be passed into other methods of this class.
    */
+  @Override
   int size();
 
-  /**
-   * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
-   * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
-   *     {@code ph:}) associated with the homonym at the given entry index, or an empty string
-   */
-  String getMorphologicalData(int entryIndex);
+  /**
+   * Same as {@code get(entryIndex).getMorphologicalData()}.
+   *
+   * @param entryIndex the index of the entry to query, passed to {@code get(int)}
+   * @return the morphological data string of the entry at that index
+   */
+  default String getMorphologicalData(int entryIndex) {
+    return get(entryIndex).getMorphologicalData();
+  }
 
-  /**
-   * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
-   * @param key the key in the form {@code kk:} by which to filter the morphological fields
-   * @return the values (of {@code vvvvvv} form) of morphological fields with the given key
-   *     associated with the homonym at the given entry index
-   */
-  List<String> getMorphologicalValues(int entryIndex, String key);
+  /**
+   * Same as {@code get(entryIndex).getMorphologicalValues(key)}.
+   *
+   * @param entryIndex the index of the entry to query, passed to {@code get(int)}
+   * @param key the key in the form {@code kk:} by which to filter the morphological fields
+   * @return the values of the morphological fields with the given key for the entry at that index
+   */
+  default List<String> getMorphologicalValues(int entryIndex, String key) {
+    return get(entryIndex).getMorphologicalValues(key);
+  }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntry.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntry.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+/** An object representing a *.dic file entry with its word, flags and morphological data. */
+public abstract class DictEntry {
+  private final String stem;
+
+  DictEntry(String stem) {
+    this.stem = stem;
+  }
+
+  /**
+   * @return the stem, followed by {@code /} and the flags if the flags are non-empty, followed by a
+   *     space and the morphological data if that is non-empty
+   */
+  @Override
+  public String toString() {
+    String result = stem;
+    String flags = getFlags();
+    if (!flags.isEmpty()) {
+      result += "/" + flags;
+    }
+    String morph = getMorphologicalData();
+    if (!morph.isEmpty()) {
+      result += " " + morph;
+    }
+    return result;
+  }
+
+  /**
+   * Entries are equal when their stems, morphological data strings and flag strings are equal. The
+   * flags are compared as the strings returned by {@link #getFlags()}, so the comparison is
+   * sensitive to the order in which the flags are listed there.
+   */
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) return true;
+    if (!(o instanceof DictEntry that)) return false;
+    return stem.equals(that.stem)
+        && getMorphologicalData().equals(that.getMorphologicalData())
+        && getFlags().equals(that.getFlags());
+  }
+
+  /** Consistent with {@link #equals}: combines the stem, flags and morphological data. */
+  @Override
+  public int hashCode() {
+    return Objects.hash(stem, getFlags(), getMorphologicalData());
+  }
+
+  /** @return the stem word in the dictionary */
+  public String getStem() {
+    return stem;
+  }
+
+  /**
+   * @return the flags associated with the dictionary entry, encoded in the same format as in the
+   *     *.dic file, but possibly in a different order
+   */
+  public abstract String getFlags();
+
+  /**
+   * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
+   *     {@code ph:}) associated with this entry, or an empty string
+   */
+  public abstract String getMorphologicalData();
+
+  /**
+   * @param key the key in the form {@code kk:} by which to filter the morphological fields
+   * @return the values (of {@code vvvvvv} form) of morphological fields with the given key
+   *     associated with this entry, or an empty list if there are none
+   */
+  public List<String> getMorphologicalValues(String key) {
+    assert key.length() == 3 && key.charAt(2) == ':'
+        : "A morphological data key should consist of two letters followed by a colon, found: "
+            + key;
+
+    String data = getMorphologicalData();
+    // Cheap pre-check so that entries without the key don't pay for splitting and streaming.
+    if (data.isEmpty() || !data.contains(key)) return Collections.emptyList();
+
+    return Arrays.stream(data.split(" "))
+        .filter(s -> s.startsWith(key))
+        .map(s -> s.substring(3)) // drop the three-character "kk:" prefix
+        .toList();
+  }
+
+  /**
+   * @param stem the stem word of the entry
+   * @param flags the string to be returned from {@link #getFlags()}
+   * @return an entry with the given stem and flags, and with empty morphological data
+   */
+  static DictEntry create(String stem, String flags) {
+    return new DictEntry(stem) {
+      @Override
+      public String getFlags() {
+        return flags;
+      }
+
+      @Override
+      public String getMorphologicalData() {
+        return "";
+      }
+    };
+  }
+}