Add component to index AWS OpenSearch #740

Merged: 14 commits, Jan 3, 2024
components/index_aws_opensearch/Dockerfile (new file, 23 additions)
@@ -0,0 +1,23 @@
FROM --platform=linux/amd64 python:3.8-slim as base

# System dependencies
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install git -y

# Install requirements
COPY requirements.txt /
RUN pip3 install --no-cache-dir -r requirements.txt

# Install Fondant
# This is split from other requirements to leverage caching
ARG FONDANT_VERSION=main
RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

# Set the working directory to the component folder
WORKDIR /component
COPY src/ src/

FROM base
WORKDIR /component/src
ENTRYPOINT ["fondant", "execute", "main"]
components/index_aws_opensearch/README.md (new file, 55 additions)
@@ -0,0 +1,55 @@
# Index AWS OpenSearch

### Description
Component that takes embeddings of text snippets and indexes them into an AWS OpenSearch vector database.

### Inputs / outputs

**This component consumes:**

- text: string
- embedding: list<item: float>

**This component produces no data.**

### Arguments

The component takes the following arguments to alter its behavior:

| argument | type | description | default |
| -------- | ---- | ----------- | ------- |
| host | str | The cluster endpoint of the AWS OpenSearch cluster where the embeddings will be indexed. For example, "my-opensearch-cluster.us-east-1.es.amazonaws.com" | / |
| region | str | The AWS region where the OpenSearch cluster is located. If not specified, the default region will be used. | / |
| index_name | str | The name of the index in the AWS OpenSearch cluster where the embeddings will be stored. | / |
| index_body | dict | A dictionary representing the body of the index request. This can include additional settings for the index operation. | / |
| port | int | The port number to connect to the AWS OpenSearch cluster. | 443 |
| use_ssl | bool | A boolean flag indicating whether to use SSL/TLS for the connection to the OpenSearch cluster. | True |
| verify_certs | bool | A boolean flag indicating whether to verify SSL certificates when connecting to the OpenSearch cluster. | True |
| pool_maxsize | int | The maximum size of the connection pool to the AWS OpenSearch cluster. | 20 |

### Usage

You can add this component to your pipeline using the following code:

```python
from fondant.pipeline import Pipeline

pipeline = Pipeline(...)

dataset = pipeline.read(...)

dataset = dataset.apply(...)

dataset.write(
    "index_aws_opensearch",
    arguments={
        # Add arguments
        # "host": "my-opensearch-cluster.us-east-1.es.amazonaws.com",
        # "region": "eu-west-1",
        # "index_name": "test-index",
        # "port": 443,
        # "use_ssl": True,
        # "verify_certs": True,
        # "pool_maxsize": 20,
    }
)
```
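For reference, a filled-in sketch of the same call. The endpoint, index name, and `index_body` below are illustrative placeholders rather than values from this PR; the `index_body` assumes the OpenSearch k-NN plugin, and the dimension of 384 is an assumption that must match your embedding model:

```python
dataset.write(
    "index_aws_opensearch",
    arguments={
        "host": "my-opensearch-cluster.us-east-1.es.amazonaws.com",  # placeholder endpoint
        "region": "us-east-1",
        "index_name": "test-index",
        # Hypothetical index body enabling k-NN search on the embedding field.
        "index_body": {
            "settings": {"index": {"knn": True}},
            "mappings": {
                "properties": {
                    "embedding": {"type": "knn_vector", "dimension": 384},
                    "text": {"type": "text"},
                }
            },
        },
    },
)
```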
components/index_aws_opensearch/fondant_component.yaml (new file, 44 additions)
@@ -0,0 +1,44 @@
name: Index AWS OpenSearch
description: Component that takes embeddings of text snippets and indexes them into an AWS OpenSearch vector database.
image: fndnt/index_aws_opensearch:dev
tags:
  - Data writing

consumes:
  text:
    type: string
  embedding:
    type: array
    items:
      type: float32

args:
  host:
    description: The cluster endpoint of the AWS OpenSearch cluster where the embeddings will be indexed. E.g. "my-test-domain.us-east-1.aoss.amazonaws.com"
    type: str
  region:
    description: The AWS region where the OpenSearch cluster is located. If not specified, the default region will be used.
    type: str
  index_name:
    description: The name of the index in the AWS OpenSearch cluster where the embeddings will be stored.
    type: str
  index_body:
    description: Parameters that specify index settings, mappings, and aliases for the newly created index.
    type: dict
  port:
    description: The port number to connect to the AWS OpenSearch cluster.
    type: int
    default: 443
  use_ssl:
    description: A boolean flag indicating whether to use SSL/TLS for the connection to the OpenSearch cluster.
    type: bool
    default: True
  verify_certs:
    description: A boolean flag indicating whether to verify SSL certificates when connecting to the OpenSearch cluster.
    type: bool
    default: True
  pool_maxsize:
    description: The maximum size of the connection pool to the AWS OpenSearch cluster.
    type: int
    default: 20

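The `consumes` section above means the component's `write` method receives a Dask DataFrame with a string `text` column and a list-of-float32 `embedding` column. A minimal sketch of such a frame, which could be handy for local testing (the column values are made up):

```python
import dask.dataframe as dd
import pandas as pd

# Toy data matching the schema declared under `consumes`:
# `text` as strings, `embedding` as lists of floats.
pdf = pd.DataFrame(
    {
        "text": ["first snippet", "second snippet"],
        "embedding": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
    }
)
ddf = dd.from_pandas(pdf, npartitions=1)
```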
components/index_aws_opensearch/requirements.txt (new file, 2 additions)
@@ -0,0 +1,2 @@
boto3==1.34.4
opensearch-py==2.4.2
components/index_aws_opensearch/src/main.py (new file, 61 additions)
@@ -0,0 +1,61 @@
from typing import Dict, Any
import dask.dataframe as dd
from fondant.component import DaskWriteComponent
import boto3
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth


class IndexAWSOpenSearchComponent(DaskWriteComponent):
    def __init__(
        self,
        host: str,
        region: str,
        index_name: str,
        index_body: Dict[str, Any],
        port: int = 443,
> **Reviewer (Contributor):** I would not include the defaults here for the optional arguments, since you already define them in the spec and that can be misleading (they are not actually used here; it will always default to the ones defined in the spec). You can instead type them as optional, just so that it's clear that they do have default values and don't need to be explicitly defined.
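A sketch of what that suggestion could look like, as I read the reviewer's comment (this is not code from the PR): the defaults move out of the signature and the parameters are typed as `Optional`:

```python
from typing import Any, Dict, Optional

from fondant.component import DaskWriteComponent


class IndexAWSOpenSearchComponent(DaskWriteComponent):
    def __init__(
        self,
        host: str,
        region: str,
        index_name: str,
        index_body: Dict[str, Any],
        # Typed as Optional to signal that defaults live in the component
        # spec; the spec values are what actually apply at runtime.
        port: Optional[int] = None,
        use_ssl: Optional[bool] = None,
        verify_certs: Optional[bool] = None,
        pool_maxsize: Optional[int] = None,
        **kwargs,
    ):
        ...
```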

        use_ssl: bool = True,
        verify_certs: bool = True,
        pool_maxsize: int = 20,
        **kwargs,
    ):
        session = boto3.Session()
        credentials = session.get_credentials()
        auth = AWSV4SignerAuth(credentials, region)
        self.index_name = index_name
        self.client = OpenSearch(
            hosts=[{"host": host, "port": port}],
            http_auth=auth,
            use_ssl=use_ssl,
            verify_certs=verify_certs,
            connection_class=RequestsHttpConnection,
            pool_maxsize=pool_maxsize,
            **kwargs,
        )
        self.create_index(index_body)

    def create_index(self, index_body: Dict[str, Any]):
        """Creates an index in AWS OpenSearch.

        Args:
            index_body (Dict[str, Any]): Parameters that specify index settings, mappings, and aliases for the newly created index.
        """
        response = self.client.indices.create(self.index_name, body=index_body)
> **Reviewer (Contributor):** What happens if the index already exists? Will the index be overwritten? Maybe we add a check if an index exists.
>
> **Author (@shub-kris, Dec 21, 2023):** Let me check what it does if an index already exists.
>
> **Author (@shub-kris):** Added tests too.
>
> **Reviewer (Contributor):** Thanks :) I think you still need to modify the Docker image to be able to run the test with it, similar to this, so that you can run tests using `docker build . --target test`. Could you please update and test?
>
> **Author (@shub-kris, Dec 22, 2023):** Thanks for pointing it out. I almost missed it. But now it's updated and tested.
>
> **Reviewer (Contributor):** Great :) Did you manage to fix the failing pipelines? After installing pre-commit, you should run `pre-commit run --all-files`.
>
> **Author (@shub-kris):** Yupp.

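As a sketch of the existence check discussed above (not the code this PR landed on; `opensearch-py` exposes `indices.exists` and `indices.create`, both already used elsewhere in this diff, and `create_index_if_missing` is a name I made up):

```python
from typing import Any, Dict

from opensearchpy import OpenSearch


def create_index_if_missing(
    client: OpenSearch, index_name: str, index_body: Dict[str, Any]
) -> None:
    # indices.exists returns True when the index is already present;
    # skipping creation avoids the error OpenSearch raises for duplicates
    # and leaves the existing index (and its data) untouched.
    if not client.indices.exists(index=index_name):
        client.indices.create(index=index_name, body=index_body)
```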

    def write(self, dataframe: dd.DataFrame):
        """
        Writes the data from the given Dask DataFrame to the AWS OpenSearch index.

        Args:
            dataframe (dd.DataFrame): The Dask DataFrame containing the data to be written.
> **Reviewer (Contributor), suggested change:** drop the type from the docstring, i.e. replace
> `dataframe (dd.DataFrame): The Dask DataFrame containing the data to be written.`
> with
> `dataframe: The Dask DataFrame containing the data to be written.`

"""
if not self.client.indices.exists(index=self.index_name):
> **Reviewer (Contributor):** I think we could call the create_index here if the index doesn't exist.
>
> **Author (@shub-kris):** Sure, we can do that.
>
> **Author (@shub-kris, Dec 21, 2023):** It's not needed anymore, as I am doing it inside the `__init__` function.
>
> **Author (@shub-kris):** Removed the Exception too.

            raise ValueError(f"Index: {self.index_name} doesn't exist. Please Create")

        for part in dataframe.partitions:
> **Reviewer (Contributor), suggested change:**
>
>         for part in tqdm(
>             dataframe.partitions,
>             desc="Processing partitions",
>             total=dataframe.npartitions,
>         ):
>
> Useful to add logs in case the dataset is large. Similar to https://github.com/ml6team/fondant/blob/main/components/index_weaviate/src/main.py
>
> Don't forget to add tqdm to the list of requirements.

            df = part.compute()
            for row in df.itertuples():
                body = {
                    "embedding": row.embedding,
                    "text": row.text,
                }
                response = self.client.index(
                    index=self.index_name, id=str(row.Index), body=body
                )
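As an aside, `opensearch-py` also ships a `helpers.bulk` utility that batches requests instead of issuing one `client.index` call per row. A hedged sketch of that alternative, not what this PR implements (`bulk_write` is a name I made up):

```python
import dask.dataframe as dd
from opensearchpy import OpenSearch, helpers


def bulk_write(client: OpenSearch, index_name: str, dataframe: dd.DataFrame) -> None:
    """Index a Dask DataFrame partition by partition using bulk requests."""
    for part in dataframe.partitions:
        df = part.compute()
        # One action dict per row; _id mirrors the row index used above.
        actions = (
            {
                "_index": index_name,
                "_id": str(row.Index),
                "_source": {"embedding": row.embedding, "text": row.text},
            }
            for row in df.itertuples()
        )
        helpers.bulk(client, actions)
```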