# This is a combination of 14 commits.

# This is the 1st commit message: Add text embedding processor to neural search Signed-off-by: Zan Niu <[email protected]> # The commit message opensearch-project#2 will be skipped: # Code format # # Signed-off-by: Zan Niu <[email protected]> # The commit message opensearch-project#3 will be skipped: # Address review comments # # Signed-off-by: Zan Niu <[email protected]> # The commit message opensearch-project#4 will be skipped: # Add blocking text embedding method for pipeline processor # # Signed-off-by: Zan Niu <[email protected]> # The commit message opensearch-project#5 will be skipped: # Add BaseNeuralSearchIT and address other review comments # # Signed-off-by: Zan Niu <[email protected]> # The commit message opensearch-project#6 will be skipped: # Add BaseNeuralSearchIT and address other review comments # # Signed-off-by: Zan Niu <[email protected]> # The commit message opensearch-project#7 will be skipped: # Add BaseNeuralSearchIT and address other review comments # # Signed-off-by: Zan Niu <[email protected]> # The commit message opensearch-project#8 will be skipped: # Fix naming convention and IT function move to base # # Signed-off-by: Zan Niu <[email protected]> # The commit message opensearch-project#9 will be skipped: # Fix naming convention and IT function move to base # # Signed-off-by: Zan Niu <[email protected]> # The commit message opensearch-project#10 will be skipped: # Update src/main/java/org/opensearch/neuralsearch/ml/MLCommonsClientAccessor.java # # Co-authored-by: Navneet Verma <[email protected]> # The commit message opensearch-project#11 will be skipped: # Update src/main/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessor.java # # Co-authored-by: Navneet Verma <[email protected]> # The commit message opensearch-project#12 will be skipped: # Fix code review comments # # Signed-off-by: Zan Niu <[email protected]> # The commit message opensearch-project#13 will be skipped: # Fix text embedding processor NPE # # 
Signed-off-by: Zan Niu <[email protected]> # The commit message opensearch-project#14 will be skipped: # Remove jackson dependencies and fix tests with XContent # # Signed-off-by: Zan Niu <[email protected]>
zane-neo · Oct 20, 2022 · 0dab8b1 · 0dab8b1
1 parent 272d803
commit 0dab8b1
Show file tree

Hide file tree

Showing 13 changed files with 1,117 additions and 26 deletions.
diff --git a/build.gradle b/build.gradle
@@ -5,7 +5,9 @@
  * Learn more about Gradle by exploring our samples at https://docs.gradle.org/7.5.1/samples
  * This project uses @Incubating APIs which are subject to change.
  */
+
 import org.opensearch.gradle.test.RestIntegTestTask
+
 import java.util.concurrent.Callable
 
 apply plugin: 'java'
@@ -137,6 +139,9 @@ dependencies {
     zipArchive group: 'org.opensearch.plugin', name:'opensearch-knn', version: "${opensearch_build}"
     compileOnly fileTree(dir: knnJarDirectory, include: '*.jar')
     api group: 'org.opensearch', name:'opensearch-ml-client', version: "${opensearch_build}"
+    implementation group: 'org.apache.commons', name: 'commons-lang3', version: '3.10'
+
+    testImplementation group: 'commons-io', name: 'commons-io', version: '2.11.0'
 }
 
 // From maven, we can get the k-NN plugin as a zip. In order to add the jar to the classpath, we need to unzip the

diff --git a/src/main/java/org/opensearch/neuralsearch/ml/MLCommonsClientAccessor.java b/src/main/java/org/opensearch/neuralsearch/ml/MLCommonsClientAccessor.java
@@ -5,27 +5,29 @@
 
 package org.opensearch.neuralsearch.ml;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.stream.Collectors;
-
 import lombok.NonNull;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.log4j.Log4j2;
-
+import org.opensearch.action.ActionFuture;
 import org.opensearch.action.ActionListener;
 import org.opensearch.ml.client.MachineLearningNodeClient;
 import org.opensearch.ml.common.FunctionName;
 import org.opensearch.ml.common.dataset.MLInputDataset;
 import org.opensearch.ml.common.dataset.TextDocsInputDataSet;
 import org.opensearch.ml.common.input.MLInput;
 import org.opensearch.ml.common.model.MLModelTaskType;
+import org.opensearch.ml.common.output.MLOutput;
 import org.opensearch.ml.common.output.model.ModelResultFilter;
 import org.opensearch.ml.common.output.model.ModelTensor;
 import org.opensearch.ml.common.output.model.ModelTensorOutput;
 import org.opensearch.ml.common.output.model.ModelTensors;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.stream.Collectors;
+
 /**
  * This class will act as an abstraction on the MLCommons client for accessing the ML Capabilities
  */
@@ -99,23 +101,54 @@ public void inferenceSentences(
         @NonNull final List<String> inputText,
         @NonNull final ActionListener<List<List<Float>>> listener
     ) {
-        final ModelResultFilter modelResultFilter = new ModelResultFilter(false, true, targetResponseFilters, null);
-        final MLInputDataset inputDataset = new TextDocsInputDataSet(inputText, modelResultFilter);
-        final MLInput mlInput = new MLInput(FunctionName.TEXT_EMBEDDING, null, inputDataset, MLModelTaskType.TEXT_EMBEDDING);
-        final List<List<Float>> vector = new ArrayList<>();
-
+        MLInput mlInput = createMLInput(targetResponseFilters, inputText);
         mlClient.predict(modelId, mlInput, ActionListener.wrap(mlOutput -> {
-            final ModelTensorOutput modelTensorOutput = (ModelTensorOutput) mlOutput;
-            final List<ModelTensors> tensorOutputList = modelTensorOutput.getMlModelOutputs();
-            for (final ModelTensors tensors : tensorOutputList) {
-                final List<ModelTensor> tensorsList = tensors.getMlModelTensors();
-                for (final ModelTensor tensor : tensorsList) {
-                    vector.add(Arrays.stream(tensor.getData()).map(value -> (Float) value).collect(Collectors.toList()));
-                }
-            }
+            final List<List<Float>> vector = buildVectorFromResponse(mlOutput);
             log.debug("Inference Response for input sentence {} is : {} ", inputText, vector);
             listener.onResponse(vector);
         }, listener::onFailure));
     }
 
+    /**
+     * Blocking counterpart of {@link #inferenceSentences(String, List, ActionListener)}. Calls the predict api of
+     * MLClient with TARGET_RESPONSE_FILTERS as the response filters, using the model provided as modelId to run the
+     * {@link MLModelTaskType#TEXT_EMBEDDING} task. The result is returned directly to the caller (no listener is
+     * involved) as a {@link List} of {@link List} of {@link Float}. We are not making this function generic enough to
+     * take any function or TaskType as currently we only need to run TextEmbedding tasks. Please note this method
+     * blocks on the prediction future, use it only when the caller has to wait for the response, otherwise please use
+     * {@link #inferenceSentences(String, List, ActionListener)} instead.
+     * @param modelId {@link String}
+     * @param inputText {@link List} of {@link String} on which inference needs to happen.
+     * @return {@link List} of {@link List} of {@link Float} representing the text embedding vector result.
+     * @throws ExecutionException If the underlying task failed, this exception will be thrown in the future.get().
+     * @throws InterruptedException If the thread is interrupted, this will be thrown.
+     */
+    public List<List<Float>> inferenceSentences(@NonNull final String modelId, @NonNull final List<String> inputText)
+        throws ExecutionException, InterruptedException {
+        final MLInput mlInput = createMLInput(TARGET_RESPONSE_FILTERS, inputText);
+        final ActionFuture<MLOutput> outputActionFuture = mlClient.predict(modelId, mlInput);
+        final List<List<Float>> vector = buildVectorFromResponse(outputActionFuture.get()); // get() waits for the result
+        log.debug("Inference Response for input sentence {} is : {} ", inputText, vector);
+        return vector;
+    }
+
+    private MLInput createMLInput(final List<String> targetResponseFilters, List<String> inputText) { // builds the request
+        final ModelResultFilter modelResultFilter = new ModelResultFilter(false, true, targetResponseFilters, null);
+        final MLInputDataset inputDataset = new TextDocsInputDataSet(inputText, modelResultFilter); // texts + filter
+        return new MLInput(FunctionName.TEXT_EMBEDDING, null, inputDataset, MLModelTaskType.TEXT_EMBEDDING);
+    } // NOTE(review): meaning of the positional false/true/null arguments is not visible here; confirm against ml-commons
+
+    private List<List<Float>> buildVectorFromResponse(MLOutput mlOutput) { // converts a predict response to vectors
+        final List<List<Float>> vector = new ArrayList<>(); // receives one inner list per ModelTensor visited below
+        final ModelTensorOutput modelTensorOutput = (ModelTensorOutput) mlOutput; // fails on any other MLOutput subtype
+        final List<ModelTensors> tensorOutputList = modelTensorOutput.getMlModelOutputs();
+        for (final ModelTensors tensors : tensorOutputList) {
+            final List<ModelTensor> tensorsList = tensors.getMlModelTensors();
+            for (final ModelTensor tensor : tensorsList) { // NOTE(review): cast below assumes every value is a Float; confirm
+                vector.add(Arrays.stream(tensor.getData()).map(value -> (Float) value).collect(Collectors.toList()));
+            }
+        }
+        return vector;
+    }
+
 }
diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
@@ -8,6 +8,7 @@
 import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 import java.util.function.Supplier;
 
 import org.opensearch.action.ActionRequest;
@@ -19,12 +20,16 @@
 import org.opensearch.common.xcontent.NamedXContentRegistry;
 import org.opensearch.env.Environment;
 import org.opensearch.env.NodeEnvironment;
+import org.opensearch.ingest.Processor;
 import org.opensearch.ml.client.MachineLearningNodeClient;
 import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor;
 import org.opensearch.neuralsearch.plugin.query.NeuralQueryBuilder;
+import org.opensearch.neuralsearch.processor.TextEmbeddingProcessor;
+import org.opensearch.neuralsearch.processor.factory.TextEmbeddingProcessorFactory;
 import org.opensearch.neuralsearch.transport.MLPredictAction;
 import org.opensearch.neuralsearch.transport.MLPredictTransportAction;
 import org.opensearch.plugins.ActionPlugin;
+import org.opensearch.plugins.IngestPlugin;
 import org.opensearch.plugins.Plugin;
 import org.opensearch.plugins.SearchPlugin;
 import org.opensearch.repositories.RepositoriesService;
@@ -35,7 +40,7 @@
 /**
  * Neural Search plugin class
  */
-public class NeuralSearch extends Plugin implements ActionPlugin, SearchPlugin {
+public class NeuralSearch extends Plugin implements ActionPlugin, SearchPlugin, IngestPlugin {
 
     @Override
     public Collection<Object> createComponents(
@@ -72,4 +77,11 @@ public List<QuerySpec<?>> getQueries() {
             new QuerySpec<>(NeuralQueryBuilder.NAME, NeuralQueryBuilder::new, NeuralQueryBuilder::fromXContent)
         );
     }
+
+    @Override // presumably the IngestPlugin#getProcessors hook; TODO confirm
+    public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) { // registers ingest processors
+        final MachineLearningNodeClient machineLearningNodeClient = new MachineLearningNodeClient(parameters.client);
+        final MLCommonsClientAccessor clientAccessor = new MLCommonsClientAccessor(machineLearningNodeClient);
+        return Collections.singletonMap(TextEmbeddingProcessor.TYPE, new TextEmbeddingProcessorFactory(clientAccessor));
+    } // the returned map has a single entry keyed by TextEmbeddingProcessor.TYPE
 }