Introduced the Word2VecSynonymFilter (#12169)

Co-authored-by: Alessandro Benedetti <[email protected]>
apache · May 30, 2023 · 64b48b8 · 64b48b8
1 parent 24df30c
commit 64b48b8
Show file tree

Hide file tree

Showing 23 changed files with 1,448 additions and 23 deletions.
diff --git a/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestRandomChains.java b/lucene/analysis.tests/src/test/org/apache/lucene/analysis/tests/TestRandomChains.java
@@ -89,6 +89,8 @@
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.stempel.StempelStemmer;
 import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.synonym.word2vec.Word2VecModel;
+import org.apache.lucene.analysis.synonym.word2vec.Word2VecSynonymProvider;
 import org.apache.lucene.store.ByteBuffersDirectory;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.MockTokenFilter;
@@ -99,8 +101,10 @@
 import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
 import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IgnoreRandomChains;
+import org.apache.lucene.util.TermAndVector;
 import org.apache.lucene.util.Version;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -415,6 +419,27 @@ private String randomNonEmptyString(Random random) {
                       }
                     }
                   });
+              put(
+                  Word2VecSynonymProvider.class,
+                  random -> {
+                    final int numEntries = atLeast(10);
+                    final int vectorDimension = random.nextInt(99) + 1;
+                    Word2VecModel model = new Word2VecModel(numEntries, vectorDimension);
+                    for (int j = 0; j < numEntries; j++) {
+                      String s = TestUtil.randomSimpleString(random, 10, 20);
+                      float[] vec = new float[vectorDimension];
+                      for (int i = 0; i < vectorDimension; i++) {
+                        vec[i] = random.nextFloat();
+                      }
+                      model.addTermAndVector(new TermAndVector(new BytesRef(s), vec));
+                    }
+                    try {
+                      return new Word2VecSynonymProvider(model);
+                    } catch (IOException e) {
+                      Rethrow.rethrow(e);
+                      return null; // unreachable code
+                    }
+                  });
               put(
                   DateFormat.class,
                   random -> {

diff --git a/lucene/analysis/common/src/java/module-info.java b/lucene/analysis/common/src/java/module-info.java
@@ -78,6 +78,7 @@
   exports org.apache.lucene.analysis.sr;
   exports org.apache.lucene.analysis.sv;
   exports org.apache.lucene.analysis.synonym;
+  exports org.apache.lucene.analysis.synonym.word2vec;
   exports org.apache.lucene.analysis.ta;
   exports org.apache.lucene.analysis.te;
   exports org.apache.lucene.analysis.th;
@@ -256,6 +257,7 @@
       org.apache.lucene.analysis.sv.SwedishMinimalStemFilterFactory,
       org.apache.lucene.analysis.synonym.SynonymFilterFactory,
       org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory,
+      org.apache.lucene.analysis.synonym.word2vec.Word2VecSynonymFilterFactory,
       org.apache.lucene.analysis.core.FlattenGraphFilterFactory,
       org.apache.lucene.analysis.te.TeluguNormalizationFilterFactory,
       org.apache.lucene.analysis.te.TeluguStemFilterFactory,

diff --git a/...analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Dl4jModelReader.java b/...analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Dl4jModelReader.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.synonym.word2vec;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Base64;
+import java.util.Locale;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.TermAndVector;
+
+/**
+ * Dl4jModelReader reads the file generated by the library Deeplearning4j and provide a
+ * Word2VecModel with normalized vectors
+ *
+ * <p>Dl4j Word2Vec documentation:
+ * https://deeplearning4j.konduit.ai/v/en-1.0.0-beta7/language-processing/word2vec Example to
+ * generate a model using dl4j:
+ * https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/advanced/modelling/embeddingsfromcorpus/word2vec/Word2VecRawTextExample.java
+ *
+ * @lucene.experimental
+ */
+public class Dl4jModelReader implements Closeable {
+
+  private static final String MODEL_FILE_NAME_PREFIX = "syn0";
+
+  private final ZipInputStream word2VecModelZipFile;
+
+  public Dl4jModelReader(InputStream stream) {
+    this.word2VecModelZipFile = new ZipInputStream(new BufferedInputStream(stream));
+  }
+
+  public Word2VecModel read() throws IOException {
+
+    ZipEntry entry;
+    while ((entry = word2VecModelZipFile.getNextEntry()) != null) {
+      String fileName = entry.getName();
+      if (fileName.startsWith(MODEL_FILE_NAME_PREFIX)) {
+        BufferedReader reader =
+            new BufferedReader(new InputStreamReader(word2VecModelZipFile, StandardCharsets.UTF_8));
+
+        String header = reader.readLine();
+        String[] headerValues = header.split(" ");
+        int dictionarySize = Integer.parseInt(headerValues[0]);
+        int vectorDimension = Integer.parseInt(headerValues[1]);
+
+        Word2VecModel model = new Word2VecModel(dictionarySize, vectorDimension);
+        String line = reader.readLine();
+        boolean isTermB64Encoded = false;
+        if (line != null) {
+          String[] tokens = line.split(" ");
+          isTermB64Encoded =
+              tokens[0].substring(0, 3).toLowerCase(Locale.ROOT).compareTo("b64") == 0;
+          model.addTermAndVector(extractTermAndVector(tokens, vectorDimension, isTermB64Encoded));
+        }
+        while ((line = reader.readLine()) != null) {
+          String[] tokens = line.split(" ");
+          model.addTermAndVector(extractTermAndVector(tokens, vectorDimension, isTermB64Encoded));
+        }
+        return model;
+      }
+    }
+    throw new IllegalArgumentException(
+        "Cannot read Dl4j word2vec model - '"
+            + MODEL_FILE_NAME_PREFIX
+            + "' file is missing in the zip. '"
+            + MODEL_FILE_NAME_PREFIX
+            + "' is a mandatory file containing the mapping between terms and vectors generated by the DL4j library.");
+  }
+
+  private static TermAndVector extractTermAndVector(
+      String[] tokens, int vectorDimension, boolean isTermB64Encoded) {
+    BytesRef term = isTermB64Encoded ? decodeB64Term(tokens[0]) : new BytesRef((tokens[0]));
+
+    float[] vector = new float[tokens.length - 1];
+
+    if (vectorDimension != vector.length) {
+      throw new RuntimeException(
+          String.format(
+              Locale.ROOT,
+              "Word2Vec model file corrupted. "
+                  + "Declared vectors of size %d but found vector of size %d for word %s (%s)",
+              vectorDimension,
+              vector.length,
+              tokens[0],
+              term.utf8ToString()));
+    }
+
+    for (int i = 1; i < tokens.length; i++) {
+      vector[i - 1] = Float.parseFloat(tokens[i]);
+    }
+    return new TermAndVector(term, vector);
+  }
+
+  static BytesRef decodeB64Term(String term) {
+    byte[] buffer = Base64.getDecoder().decode(term.substring(4));
+    return new BytesRef(buffer, 0, buffer.length);
+  }
+
+  @Override
+  public void close() throws IOException {
+    word2VecModelZipFile.close();
+  }
+}
diff --git a/...e/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java b/...e/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.synonym.word2vec;
+
+import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.TermAndVector;
+import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
+
+/**
+ * Word2VecModel represents a parsed Word2Vec model: it holds the vector of each word in the
+ * dictionary and allows looking vectors up either by ordinal or by term.
+ *
+ * <p>Vectors are normalized when they are added to the model.
+ *
+ * @lucene.experimental
+ */
+public class Word2VecModel implements RandomAccessVectorValues<float[]> {
+
+  private final int dictionarySize;
+  private final int vectorDimension;
+  private final TermAndVector[] termsAndVectors;
+  private final BytesRefHash word2Vec;
+  // Maps the id assigned by word2Vec to the ordinal of the entry in termsAndVectors. The two
+  // numberings differ as soon as a term is added twice, because the hash does not hand out a new
+  // id for a term it already contains.
+  private final int[] termOrds;
+  private int loadedCount = 0;
+
+  /**
+   * Creates an empty model.
+   *
+   * @param dictionarySize number of entries the model can hold; also the value of {@link #size()}
+   * @param vectorDimension dimension reported by {@link #dimension()}
+   */
+  public Word2VecModel(int dictionarySize, int vectorDimension) {
+    this.dictionarySize = dictionarySize;
+    this.vectorDimension = vectorDimension;
+    this.termsAndVectors = new TermAndVector[dictionarySize];
+    this.word2Vec = new BytesRefHash();
+    this.termOrds = new int[dictionarySize];
+  }
+
+  private Word2VecModel(
+      int dictionarySize,
+      int vectorDimension,
+      TermAndVector[] termsAndVectors,
+      BytesRefHash word2Vec,
+      int[] termOrds,
+      int loadedCount) {
+    this.dictionarySize = dictionarySize;
+    this.vectorDimension = vectorDimension;
+    this.termsAndVectors = termsAndVectors;
+    this.word2Vec = word2Vec;
+    this.termOrds = termOrds;
+    this.loadedCount = loadedCount;
+  }
+
+  /**
+   * Normalizes the vector of {@code modelEntry} and stores the entry at the next free ordinal.
+   *
+   * <p>If the same term is added more than once, every entry keeps its own ordinal, and {@link
+   * #vectorValue(BytesRef)} resolves the term to the first entry that was added for it.
+   *
+   * @param modelEntry the term and its vector
+   */
+  public void addTermAndVector(TermAndVector modelEntry) {
+    modelEntry.normalizeVector();
+    final int ord = loadedCount++;
+    this.termsAndVectors[ord] = modelEntry;
+    // add() returns a negative value when the term is already present; in that case the existing
+    // id -> ordinal mapping is left untouched.
+    final int termId = this.word2Vec.add(modelEntry.getTerm());
+    if (termId >= 0) {
+      this.termOrds[termId] = ord;
+    }
+  }
+
+  @Override
+  public float[] vectorValue(int targetOrd) {
+    return termsAndVectors[targetOrd].getVector();
+  }
+
+  /**
+   * Returns the vector associated with {@code term}.
+   *
+   * @param term the term to look up
+   * @return the vector of the term, or {@code null} if the term is not in the model
+   */
+  public float[] vectorValue(BytesRef term) {
+    int termId = this.word2Vec.find(term);
+    if (termId < 0) {
+      return null;
+    }
+    TermAndVector entry = this.termsAndVectors[this.termOrds[termId]];
+    return (entry == null) ? null : entry.getVector();
+  }
+
+  /** Returns the term stored at ordinal {@code targetOrd}. */
+  public BytesRef termValue(int targetOrd) {
+    return termsAndVectors[targetOrd].getTerm();
+  }
+
+  @Override
+  public int dimension() {
+    return vectorDimension;
+  }
+
+  @Override
+  public int size() {
+    return dictionarySize;
+  }
+
+  /** Returns a new instance that shares the entries and the term dictionary of this model. */
+  @Override
+  public RandomAccessVectorValues<float[]> copy() throws IOException {
+    return new Word2VecModel(
+        this.dictionarySize,
+        this.vectorDimension,
+        this.termsAndVectors,
+        this.word2Vec,
+        this.termOrds,
+        this.loadedCount);
+  }
+}
diff --git a/...is/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecSynonymFilter.java b/...is/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecSynonymFilter.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.synonym.word2vec;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.TermAndBoost;
+
+/**
+ * Token filter that injects single-token synonyms, obtained from a Word2Vec trained network, into
+ * an incoming {@link TokenStream}. Each synonym is emitted right after the token it was found for,
+ * with a position increment of 0.
+ *
+ * @lucene.experimental
+ */
+public final class Word2VecSynonymFilter extends TokenFilter {
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final PositionIncrementAttribute posIncrementAtt =
+      addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+  private final Word2VecSynonymProvider synonymProvider;
+  private final int maxSynonymsPerTerm;
+  private final float minAcceptedSimilarity;
+  // Synonyms of the most recently consumed input token that still have to be emitted.
+  private final LinkedList<TermAndBoost> synonymBuffer = new LinkedList<>();
+  // Attribute state captured for the input token the buffered synonyms belong to.
+  private State lastState;
+
+  /**
+   * Builds a filter that expands the tokens of {@code input} using {@code synonymProvider}.
+   *
+   * @param input input tokenstream
+   * @param synonymProvider previously built provider used to look synonyms up
+   * @param maxSynonymsPerTerm maximum number of results requested from the synonym search
+   * @param minAcceptedSimilarity minimal cosine similarity between the searched vector and the
+   *     retrieved ones
+   */
+  public Word2VecSynonymFilter(
+      TokenStream input,
+      Word2VecSynonymProvider synonymProvider,
+      int maxSynonymsPerTerm,
+      float minAcceptedSimilarity) {
+    super(input);
+    this.synonymProvider = synonymProvider;
+    this.maxSynonymsPerTerm = maxSynonymsPerTerm;
+    this.minAcceptedSimilarity = minAcceptedSimilarity;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (synonymBuffer.isEmpty()) {
+      return consumeInputToken();
+    }
+    emitBufferedSynonym();
+    return true;
+  }
+
+  /**
+   * Advances the wrapped stream by one token and buffers the synonyms found for it.
+   *
+   * @return {@code false} if the wrapped stream is exhausted, {@code true} otherwise
+   */
+  private boolean consumeInputToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    BytesRefBuilder termBytes = new BytesRefBuilder();
+    termBytes.copyChars(termAtt.buffer(), 0, termAtt.length());
+    BytesRef lookupTerm = termBytes.get();
+    List<TermAndBoost> found =
+        synonymProvider.getSynonyms(lookupTerm, maxSynonymsPerTerm, minAcceptedSimilarity);
+    if (!found.isEmpty()) {
+      // Remember the attributes of the original token so every synonym can start from them.
+      lastState = captureState();
+      synonymBuffer.addAll(found);
+    }
+    return true;
+  }
+
+  /** Emits the next buffered synonym on top of the state captured for the original token. */
+  private void emitBufferedSynonym() {
+    TermAndBoost next = synonymBuffer.pollFirst();
+    clearAttributes();
+    restoreState(lastState);
+    termAtt.setEmpty().append(next.term.utf8ToString());
+    typeAtt.setType(SynonymGraphFilter.TYPE_SYNONYM);
+    posLenAtt.setPositionLength(1);
+    posIncrementAtt.setPositionIncrement(0);
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    synonymBuffer.clear();
+  }
+}