diff --git a/components/evaluate_ragas/Dockerfile b/components/evaluate_ragas/Dockerfile
deleted file mode 100644
index a5c3fa17a..000000000
--- a/components/evaluate_ragas/Dockerfile
+++ /dev/null
@@ -1,30 +0,0 @@
-FROM --platform=linux/amd64 python:3.8-slim as base
-
-# System dependencies
-RUN apt-get update && \
- apt-get upgrade -y && \
- apt-get install git -y
-
-# Install requirements
-COPY requirements.txt /
-RUN pip3 install --no-cache-dir -r requirements.txt
-
-# Install Fondant
-# This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
-RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
-
-# Set the working directory to the component folder
-WORKDIR /component
-COPY src/ src/
-
-FROM base as test
-COPY tests/ tests/
-RUN pip3 install --no-cache-dir -r tests/requirements.txt
-ARG OPENAI_KEY
-ENV OPENAI_KEY=${OPENAI_KEY}
-RUN python -m pytest tests
-
-FROM base
-WORKDIR /component/src
-ENTRYPOINT ["fondant", "execute", "main"]
diff --git a/components/evaluate_ragas/README.md b/components/evaluate_ragas/README.md
deleted file mode 100644
index fbaf13a3e..000000000
--- a/components/evaluate_ragas/README.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# Evaluate ragas
-
-
-## Description
-Component that evaluates the retriever using RAGAS
-
-
-## Inputs / outputs
-
-
-### Consumes
-**This component consumes:**
-
-- question: string
-- retrieved_chunks: list
-
-
-
-
-
-### Produces
-
-**This component can produce additional fields**
-- <field_name>: <field_type>
-This defines a mapping to update the fields produced by the operation as defined in the component spec.
-The keys are the names of the fields to be produced by the component, while the values are
-the type of the field that should be used to write the output dataset.
-
-
-
-## Arguments
-
-The component takes the following arguments to alter its behavior:
-
-| argument | type | description | default |
-| -------- | ---- | ----------- | ------- |
-| llm_module_name | str | Module from which the LLM is imported. Defaults to langchain.chat_models | langchain.chat_models |
-| llm_class_name | str | Name of the selected llm | ChatOpenAI |
-| llm_kwargs | dict | Arguments of the selected llm | {'model_name': 'gpt-3.5-turbo'} |
-
-
-## Usage
-
-You can add this component to your pipeline using the following code:
-
-```python
-from fondant.pipeline import Pipeline
-
-
-pipeline = Pipeline(...)
-
-dataset = pipeline.read(...)
-
-dataset = dataset.apply(
- "evaluate_ragas",
- arguments={
- # Add arguments
- # "llm_module_name": "langchain.chat_models",
- # "llm_class_name": "ChatOpenAI",
- # "llm_kwargs": {'model_name': 'gpt-3.5-turbo'},
- },
- produces={
- <field_name>: <field_type>,
- ..., # Add fields
- },
-)
-```
-
-
-## Testing
-
-You can run the tests using docker with BuildKit. From this directory, run:
-```
-docker build . --target test
-```
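
For instance, mirroring the component's own test, the usage example above could be filled in to compute two RAGAS metrics. This is a sketch: the metric names must match objects available in `ragas.metrics`, and the chosen LLM needs a valid OpenAI key.

```python
import pyarrow as pa
from fondant.pipeline import Pipeline

pipeline = Pipeline(...)

dataset = pipeline.read(...)

dataset = dataset.apply(
    "evaluate_ragas",
    arguments={
        "llm_module_name": "langchain.chat_models",
        "llm_class_name": "ChatOpenAI",
        "llm_kwargs": {"model_name": "gpt-3.5-turbo"},
    },
    produces={
        # Each key must be a metric defined in ragas.metrics
        "context_precision": pa.float32(),
        "context_relevancy": pa.float32(),
    },
)
```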
diff --git a/components/evaluate_ragas/fondant_component.yaml b/components/evaluate_ragas/fondant_component.yaml
deleted file mode 100644
index bca9d404b..000000000
--- a/components/evaluate_ragas/fondant_component.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: Evaluate ragas
-description: Component that evaluates the retriever using RAGAS
-image: fndnt/evaluate_ragas:dev
-tags:
- - Text processing
-
-consumes:
- question:
- type: string
- retrieved_chunks:
- type: array
- items:
- type: string
-
-produces:
- additionalProperties: true
- # Overwrite with metrics to be computed by ragas
- # (https://docs.ragas.io/en/latest/concepts/metrics/index.html)
-
-
-args:
- llm_module_name:
- description: Module from which the LLM is imported. Defaults to langchain.chat_models
- type: str
- default: "langchain.chat_models"
- llm_class_name:
- description: Name of the selected llm
- type: str
- default: "ChatOpenAI"
- llm_kwargs:
- description: Arguments of the selected llm
- type: dict
- default: {"model_name":"gpt-3.5-turbo"}
diff --git a/components/evaluate_ragas/requirements.txt b/components/evaluate_ragas/requirements.txt
deleted file mode 100644
index 64c1d6905..000000000
--- a/components/evaluate_ragas/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-ragas==0.0.21
\ No newline at end of file
diff --git a/components/evaluate_ragas/src/main.py b/components/evaluate_ragas/src/main.py
deleted file mode 100644
index b37e873b2..000000000
--- a/components/evaluate_ragas/src/main.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import typing as t
-
-import pandas as pd
-from datasets import Dataset
-from fondant.component import PandasTransformComponent
-from ragas import evaluate
-from ragas.llms import LangchainLLM
-
-
-class RetrieverEval(PandasTransformComponent):
- def __init__(
- self,
- *,
- llm_module_name: str,
- llm_class_name: str,
- llm_kwargs: dict,
- produces: t.Dict[str, t.Any],
- ) -> None:
- """
- Args:
- llm_module_name: Module from which the LLM is imported. Defaults to
- langchain.chat_models
- llm_class_name: Name of the selected llm. Defaults to ChatOpenAI
- llm_kwargs: Arguments of the selected llm
- produces: RAGAS metrics to compute.
- """
- self.llm = self.extract_llm(
- llm_module_name=llm_module_name,
- llm_class_name=llm_class_name,
- llm_kwargs=llm_kwargs,
- )
- self.gpt_wrapper = LangchainLLM(llm=self.llm)
- self.metric_functions = self.extract_metric_functions(
- metrics=list(produces.keys()),
- )
- self.set_llm(self.metric_functions)
-
- # import the metric functions selected
- @staticmethod
- def import_from(module_name: str, element_name: str):
- module = __import__(module_name, fromlist=[element_name])
- return getattr(module, element_name)
-
- def extract_llm(self, llm_module_name: str, llm_class_name: str, llm_kwargs: dict):
- module = self.import_from(
- module_name=llm_module_name,
- element_name=llm_class_name,
- )
- return module(**llm_kwargs)
-
- def extract_metric_functions(self, metrics: list):
- functions = []
- for metric in metrics:
- functions.append(self.import_from("ragas.metrics", metric))
- return functions
-
- def set_llm(self, metric_functions: list):
- for metric_function in metric_functions:
- metric_function.llm = self.gpt_wrapper
-
- # evaluate the retriever
- @staticmethod
- def create_hf_ds(dataframe: pd.DataFrame):
- dataframe = dataframe.rename(
- columns={"retrieved_chunks": "contexts"},
- )
- return Dataset.from_pandas(dataframe)
-
- def ragas_eval(self, dataset):
- return evaluate(dataset=dataset, metrics=self.metric_functions)
-
- def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
- hf_dataset = self.create_hf_ds(
- dataframe=dataframe[["question", "retrieved_chunks"]],
- )
- if "id" in hf_dataset.column_names:
- hf_dataset = hf_dataset.remove_columns("id")
-
- result = self.ragas_eval(dataset=hf_dataset)
- results_df = result.to_pandas()
- results_df = results_df.set_index(dataframe.index)
-
- return results_df
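
For reference, the dynamic resolution of the LLM class and the metric objects boils down to the following standalone sketch (assumes `langchain` and `ragas` are installed and an OpenAI key is configured in the environment):

```python
def import_from(module_name: str, element_name: str):
    """Import `element_name` from `module_name` by name, as done in RetrieverEval."""
    module = __import__(module_name, fromlist=[element_name])
    return getattr(module, element_name)


# Resolve the LLM class from its module and class name, then instantiate it.
llm_class = import_from("langchain.chat_models", "ChatOpenAI")
llm = llm_class(model_name="gpt-3.5-turbo")

# Resolve a RAGAS metric object by name; the component does this for every key in `produces`.
context_precision = import_from("ragas.metrics", "context_precision")
```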
diff --git a/components/evaluate_ragas/tests/component_test.py b/components/evaluate_ragas/tests/component_test.py
deleted file mode 100644
index 3a2f79be4..000000000
--- a/components/evaluate_ragas/tests/component_test.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import os
-
-import pandas as pd
-import pyarrow as pa
-from main import RetrieverEval
-
-
-def test_transform():
- input_dataframe = pd.DataFrame(
- {
- "question": [
- "Lorem ipsum dolor sit amet, consectetur adipiscing elit?",
- "Sed massa massa, interdum a porttitor sit amet, semper eget nunc?",
- ],
- "retrieved_chunks": [
- [
- "Lorem ipsum dolor sit amet, consectetur adipiscing elit. \
- Quisque ut efficitur neque. Aenean mollis eleifend est, \
- eu laoreet magna egestas quis. Cras id sagittis erat. \
- Aliquam vel blandit arcu. Morbi ac nulla ullamcorper, \
- rutrum neque nec, pellentesque diam. Nulla nec tempor \
- enim. Suspendisse a volutpat leo, quis varius dolor.",
- "Curabitur placerat ultrices mauris et lobortis. Maecenas \
- laoreet tristique sagittis. Integer facilisis eleifend \
- dolor, quis fringilla orci eleifend ac. Vestibulum nunc \
- odio, tincidunt ut augue et, ornare vehicula sapien. Orci \
- varius natoque penatibus et magnis dis parturient montes, \
- nascetur ridiculus mus. Sed auctor felis lacus, rutrum \
- tempus ligula viverra ac. Curabitur pharetra mauris et \
- ornare pulvinar. Suspendisse a ultricies nisl. Mauris \
- sit amet odio condimentum, venenatis orci vitae, \
- tincidunt purus. Ut ullamcorper convallis ligula ac \
- posuere. In efficitur enim ac lacus dignissim congue. \
- Nam turpis augue, aliquam et velit sit amet, varius \
- euismod ante. Duis volutpat nisl sit amet auctor tempus.\
- Vivamus in eros ex.",
- ],
- [
- "am leo massa, ultricies eu viverra ac, commodo non sapien. \
- Mauris et mauris sollicitudin, ultricies ex ac, luctus \
- nulla.",
- "Cras tincidunt facilisis mi, ac eleifend justo lobortis ut. \
- In lobortis cursus ante et faucibus. Vestibulum auctor \
- felis at odio varius, ac vulputate leo dictum. \
- Phasellus in augue ante. Aliquam aliquam mauris \
- sed tellus egestas fermentum.",
- ],
- ],
- },
- )
-
- component = RetrieverEval(
- llm_module_name="langchain.llms",
- llm_class_name="OpenAI",
- llm_kwargs={"openai_api_key": os.environ["OPENAI_KEY"]},
- produces={
- "context_precision": pa.float32(),
- "context_relevancy": pa.float32(),
- },
- )
-
- output_dataframe = component.transform(input_dataframe)
-
- expected_output_dataframe = pd.DataFrame(
- {
- "question": [
- "Lorem ipsum dolor sit amet, consectetur adipiscing elit?",
- "Sed massa massa, interdum a porttitor sit amet, semper eget nunc?",
- ],
- "contexts": [
- [
- "Lorem ipsum dolor sit amet, consectetur adipiscing elit. \
- Quisque ut efficitur neque. Aenean mollis eleifend est, \
- eu laoreet magna egestas quis. Cras id sagittis erat. \
- Aliquam vel blandit arcu. Morbi ac nulla ullamcorper, \
- rutrum neque nec, pellentesque diam. Nulla nec tempor \
- enim. Suspendisse a volutpat leo, quis varius dolor.",
- "Curabitur placerat ultrices mauris et lobortis. Maecenas \
- laoreet tristique sagittis. Integer facilisis eleifend \
- dolor, quis fringilla orci eleifend ac. Vestibulum nunc \
- odio, tincidunt ut augue et, ornare vehicula sapien. Orci \
- varius natoque penatibus et magnis dis parturient montes, \
- nascetur ridiculus mus. Sed auctor felis lacus, rutrum \
- tempus ligula viverra ac. Curabitur pharetra mauris et \
- ornare pulvinar. Suspendisse a ultricies nisl. Mauris \
- sit amet odio condimentum, venenatis orci vitae, \
- tincidunt purus. Ut ullamcorper convallis ligula ac \
- posuere. In efficitur enim ac lacus dignissim congue. \
- Nam turpis augue, aliquam et velit sit amet, varius \
- euismod ante. Duis volutpat nisl sit amet auctor tempus.\
- Vivamus in eros ex.",
- ],
- [
- "am leo massa, ultricies eu viverra ac, commodo non sapien. \
- Mauris et mauris sollicitudin, ultricies ex ac, luctus \
- nulla.",
- "Cras tincidunt facilisis mi, ac eleifend justo lobortis ut. \
- In lobortis cursus ante et faucibus. Vestibulum auctor \
- felis at odio varius, ac vulputate leo dictum. \
- Phasellus in augue ante. Aliquam aliquam mauris \
- sed tellus egestas fermentum.",
- ],
- ],
- "context_precision": 0.15,
- "context_relevancy": 0.35,
- },
- )
-
- # Check if columns are the same
- columns_equal = expected_output_dataframe.columns.equals(output_dataframe.columns)
-
- # Check if data types within each column match
- dtypes_match = expected_output_dataframe.dtypes.equals(output_dataframe.dtypes)
-
- # Check if both conditions are met
- assert columns_equal
- assert dtypes_match
diff --git a/components/evaluate_ragas/tests/pytest.ini b/components/evaluate_ragas/tests/pytest.ini
deleted file mode 100644
index bf6a8a517..000000000
--- a/components/evaluate_ragas/tests/pytest.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-[pytest]
-pythonpath = ../src
\ No newline at end of file
diff --git a/components/evaluate_ragas/tests/requirements.txt b/components/evaluate_ragas/tests/requirements.txt
deleted file mode 100644
index de1887bec..000000000
--- a/components/evaluate_ragas/tests/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-pytest==7.4.2
\ No newline at end of file
diff --git a/components/load_with_llamahub/Dockerfile b/components/load_with_llamahub/Dockerfile
deleted file mode 100644
index 5de6e945f..000000000
--- a/components/load_with_llamahub/Dockerfile
+++ /dev/null
@@ -1,29 +0,0 @@
-FROM --platform=linux/amd64 python:3.8-slim as base
-
-# System dependencies
-RUN apt-get update && \
- apt-get upgrade -y && \
- apt-get install git -y
-
-# Install requirements
-COPY requirements.txt /
-RUN pip3 install --no-cache-dir -r requirements.txt
-
-# Install Fondant
-# This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
-RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
-
-# Set the working directory to the component folder
-WORKDIR /component
-COPY src/ src/
-
-FROM base as test
-COPY tests/ tests/
-RUN pip3 install --no-cache-dir -r tests/requirements.txt
-RUN python -m pytest tests
-
-FROM base
-WORKDIR /component/src
-ENTRYPOINT ["fondant", "execute", "main"]
-
diff --git a/components/load_with_llamahub/README.md b/components/load_with_llamahub/README.md
deleted file mode 100644
index 5c3aa1320..000000000
--- a/components/load_with_llamahub/README.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# Load with LlamaHub
-
-
-## Description
-Load data using a LlamaHub loader. For available loaders, check the
-[LlamaHub](https://llamahub.ai/).
-
-
-
-## Inputs / outputs
-
-
-### Consumes
-
-
-**This component does not consume data.**
-
-
-
-### Produces
-
-**This component can produce additional fields**
-- <field_name>: <field_type>
-This defines a mapping to update the fields produced by the operation as defined in the component spec.
-The keys are the names of the fields to be produced by the component, while the values are
-the type of the field that should be used to write the output dataset.
-
-
-
-## Arguments
-
-The component takes the following arguments to alter its behavior:
-
-| argument | type | description | default |
-| -------- | ---- | ----------- | ------- |
-| loader_class | str | The name of the LlamaIndex loader class to use. Make sure to provide the name and not the id. The name is passed to `llama_index.download_loader` to download the specified loader. | / |
-| loader_kwargs | str | Keyword arguments to pass when instantiating the loader class. Check the documentation of the loader to see which arguments it accepts. | / |
-| load_kwargs | str | Keyword arguments to pass to the `.load()` method of the loader. Check the documentation of the loader to see which arguments it accepts. | / |
-| additional_requirements | list | Some loaders require additional dependencies to be installed. You can specify those here. Use a format accepted by `pip install`, e.g. "pypdf" or "pypdf==3.17.1". Unfortunately, additional requirements for LlamaIndex loaders are not documented well, but if a dependency is missing, a clear error message will be thrown. | / |
-| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | / |
-| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | / |
-
-
-## Usage
-
-You can add this component to your pipeline using the following code:
-
-```python
-from fondant.pipeline import Pipeline
-
-
-pipeline = Pipeline(...)
-
-dataset = pipeline.read(
- "load_with_llamahub",
- arguments={
- # Add arguments
- # "loader_class": ,
- # "loader_kwargs": ,
- # "load_kwargs": ,
- # "additional_requirements": [],
- # "n_rows_to_load": 0,
- # "index_column": ,
- },
- produces={
- <field_name>: <field_type>,
- ..., # Add fields
- },
-)
-```
-
-
-## Testing
-
-You can run the tests using docker with BuildKit. From this directory, run:
-```
-docker build . --target test
-```
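
As a concrete illustration, the configuration below mirrors the component's own test and uses the `ArxivReader` loader. It is a sketch: the loader arguments and produced fields are assumptions based on that test.

```python
import pyarrow as pa
from fondant.pipeline import Pipeline

pipeline = Pipeline(...)

dataset = pipeline.read(
    "load_with_llamahub",
    arguments={
        "loader_class": "ArxivReader",
        "loader_kwargs": {},
        "load_kwargs": {
            "search_query": "retrieval augmented generation",
            "max_results": 5,
        },
        # ArxivReader needs pypdf to parse the downloaded papers
        "additional_requirements": ["pypdf"],
        "n_rows_to_load": 10,
    },
    produces={
        "text": pa.string(),
        "URL": pa.string(),
    },
)
```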
diff --git a/components/load_with_llamahub/fondant_component.yaml b/components/load_with_llamahub/fondant_component.yaml
deleted file mode 100644
index ca16ff794..000000000
--- a/components/load_with_llamahub/fondant_component.yaml
+++ /dev/null
@@ -1,47 +0,0 @@
-name: Load with LlamaHub
-description: |
- Load data using a LlamaHub loader. For available loaders, check the
- [LlamaHub](https://llamahub.ai/).
-image: fndnt/load_with_llamahub:dev
-tags:
- - Data loading
-
-produces:
- additionalProperties: true
-
-args:
- loader_class:
- description: |
- The name of the LlamaIndex loader class to use. Make sure to provide the name and not the
- id. The name is passed to `llama_index.download_loader` to download the specified loader.
- type: str
- loader_kwargs:
- description: |
- Keyword arguments to pass when instantiating the loader class. Check the documentation of
- the loader to check which arguments it accepts.
- type: str
- load_kwargs:
- description: |
- Keyword arguments to pass to the `.load()` method of the loader. Check the documentation of
- the loader to check which arguments it accepts.
- type: str
- additional_requirements:
- description: |
- Some loaders require additional dependencies to be installed. You can specify those here.
- Use a format accepted by `pip install`. Eg. "pypdf" or "pypdf==3.17.1". Unfortunately
- additional requirements for LlamaIndex loaders are not documented well, but if a dependency
- is missing, a clear error message will be thrown.
- type: list
- default: []
- n_rows_to_load:
- description: |
- Optional argument that defines the number of rows to load. Useful for testing pipeline runs
- on a small scale
- type: int
- default: None
- index_column:
- description: |
- Column to set index to in the load component, if not specified a default globally unique
- index will be set
- type: str
- default: None
diff --git a/components/load_with_llamahub/requirements.txt b/components/load_with_llamahub/requirements.txt
deleted file mode 100644
index 3a7971f8f..000000000
--- a/components/load_with_llamahub/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-llama-index==0.9.9
diff --git a/components/load_with_llamahub/src/main.py b/components/load_with_llamahub/src/main.py
deleted file mode 100644
index 8be99f096..000000000
--- a/components/load_with_llamahub/src/main.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import logging
-import subprocess
-import sys
-import typing as t
-from collections import defaultdict
-
-import dask.dataframe as dd
-import pandas as pd
-from fondant.component import DaskLoadComponent
-from fondant.core.component_spec import OperationSpec
-from llama_index import download_loader
-
-logger = logging.getLogger(__name__)
-
-
-class LlamaHubReader(DaskLoadComponent):
- def __init__(
- self,
- spec: OperationSpec,
- *,
- loader_class: str,
- loader_kwargs: dict,
- load_kwargs: dict,
- additional_requirements: t.List[str],
- n_rows_to_load: t.Optional[int] = None,
- index_column: t.Optional[str] = None,
- ) -> None:
- """
- Args:
- spec: the component spec
- loader_class: The name of the LlamaIndex loader class to use
- loader_kwargs: Keyword arguments to pass when instantiating the loader class
- load_kwargs: Keyword arguments to pass to the `.load()` method of the loader
- additional_requirements: Additional Python requirements to install
- n_rows_to_load: optional argument that defines the number of rows to load.
- Useful for testing pipeline runs on a small scale.
- index_column: Column to set index to in the load component, if not specified a default
- globally unique index will be set.
- """
- self.n_rows_to_load = n_rows_to_load
- self.index_column = index_column
- self.spec = spec
-
- self.install_additional_requirements(additional_requirements)
-
- loader_cls = download_loader(loader_class)
- self.loader = loader_cls(**loader_kwargs)
- self.load_kwargs = load_kwargs
-
- @staticmethod
- def install_additional_requirements(additional_requirements: t.List[str]):
- for requirement in additional_requirements:
- subprocess.check_call( # nosec
- [sys.executable, "-m", "pip", "install", requirement],
- )
-
- def set_df_index(self, dask_df: dd.DataFrame) -> dd.DataFrame:
- if self.index_column is None:
- logger.info(
- "Index column not specified, setting a globally unique index",
- )
-
- def _set_unique_index(dataframe: pd.DataFrame, partition_info=None):
- """Function that sets a unique index based on the partition and row number."""
- dataframe["id"] = 1
- dataframe["id"] = (
- str(partition_info["number"])
- + "_"
- + (dataframe.id.cumsum()).astype(str)
- )
- dataframe.index = dataframe.pop("id")
- return dataframe
-
- def _get_meta_df() -> pd.DataFrame:
- meta_dict = {"id": pd.Series(dtype="object")}
- for field_name, field in self.spec.inner_produces.items():
- meta_dict[field_name] = pd.Series(
- dtype=pd.ArrowDtype(field.type.value),
- )
- return pd.DataFrame(meta_dict).set_index("id")
-
- meta = _get_meta_df()
- dask_df = dask_df.map_partitions(_set_unique_index, meta=meta)
- else:
- logger.info(f"Setting `{self.index_column}` as index")
- dask_df = dask_df.set_index(self.index_column, drop=True)
-
- return dask_df
-
- def load(self) -> dd.DataFrame:
- try:
- documents = self.loader.lazy_load_data(**self.load_kwargs)
- except NotImplementedError:
- documents = self.loader.load_data(**self.load_kwargs)
-
- doc_dict = defaultdict(list)
- for d, document in enumerate(documents):
- for column in self.spec.inner_produces:
- if column == "text":
- doc_dict["text"].append(document.text)
- else:
- doc_dict[column].append(document.metadata.get(column))
-
- if self.n_rows_to_load is not None and d + 1 >= self.n_rows_to_load:
- break
-
- dask_df = dd.from_dict(doc_dict, npartitions=1)
-
- dask_df = self.set_df_index(dask_df)
- return dask_df
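
Outside of Fondant, the loader resolution above corresponds roughly to the following snippet (a sketch assuming `llama-index==0.9.x`, network access, and `pypdf` installed):

```python
from llama_index import download_loader

# download_loader expects the loader *name* (not its id), as in the component argument
loader_cls = download_loader("ArxivReader")
loader = loader_cls()

documents = loader.load_data(search_query="retrieval augmented generation", max_results=3)
for document in documents:
    # The component maps `document.text` to the "text" field and looks up
    # every other produced field in `document.metadata`.
    print(len(document.text), list(document.metadata.keys()))
```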
diff --git a/components/load_with_llamahub/tests/component_test.py b/components/load_with_llamahub/tests/component_test.py
deleted file mode 100644
index 217b42281..000000000
--- a/components/load_with_llamahub/tests/component_test.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from pathlib import Path
-
-import yaml
-from fondant.core.component_spec import ComponentSpec
-
-from src.main import LlamaHubReader
-
-
-def test_arxiv_reader():
- """Test the component with the ArxivReader.
-
- This test requires a stable internet connection, both to download the loader, and to download
- the papers from Arxiv.
- """
- with open(Path(__file__).with_name("fondant_component.yaml")) as f:
- spec = yaml.safe_load(f)
- spec = ComponentSpec(spec)
-
- component = LlamaHubReader(
- spec=spec,
- loader_class="ArxivReader",
- loader_kwargs={},
- load_kwargs={
- "search_query": "jeff dean",
- "max_results": 5,
- },
- additional_requirements=["pypdf"],
- n_rows_to_load=None,
- index_column=None,
- )
-
- output_dataframe = component.load().compute()
-
- assert len(output_dataframe) > 0
- assert output_dataframe.columns.tolist() == ["text", "URL", "Title of this paper"]
diff --git a/components/load_with_llamahub/tests/fondant_component.yaml b/components/load_with_llamahub/tests/fondant_component.yaml
deleted file mode 100644
index b0f34786f..000000000
--- a/components/load_with_llamahub/tests/fondant_component.yaml
+++ /dev/null
@@ -1,50 +0,0 @@
-name: Load with LlamaHub
-description: |
- Load data using a LlamaHub loader. For available loaders, check the
- [LlamaHub](https://llamahub.ai/).
-image: ghcr.io/ml6team/load_with_llamahub:dev
-
-produces:
- text:
- type: string
- URL:
- type: string
- Title of this paper:
- type: string
-
-args:
- loader_class:
- description: |
- The name of the LlamaIndex loader class to use. Make sure to provide the name and not the
- id. The name is passed to `llama_index.download_loader` to download the specified loader.
- type: str
- loader_kwargs:
- description: |
- Keyword arguments to pass when instantiating the loader class. Check the documentation of
- the loader to check which arguments it accepts.
- type: str
- load_kwargs:
- description: |
- Keyword arguments to pass to the `.load()` method of the loader. Check the documentation of
- the loader to check which arguments it accepts.
- type: str
- additional_requirements:
- description: |
- Some loaders require additional dependencies to be installed. You can specify those here.
- Use a format accepted by `pip install`. Eg. "pypdf" or "pypdf==3.17.1". Unfortunately
- additional requirements for LlamaIndex loaders are not documented well, but if a dependency
- is missing, a clear error message will be thrown.
- type: list
- default: []
- n_rows_to_load:
- description: |
- Optional argument that defines the number of rows to load. Useful for testing pipeline runs
- on a small scale
- type: int
- default: None
- index_column:
- description: |
- Column to set index to in the load component, if not specified a default globally unique
- index will be set
- type: str
- default: None
diff --git a/components/load_with_llamahub/tests/pytest.ini b/components/load_with_llamahub/tests/pytest.ini
deleted file mode 100644
index bf6a8a517..000000000
--- a/components/load_with_llamahub/tests/pytest.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-[pytest]
-pythonpath = ../src
\ No newline at end of file
diff --git a/components/load_with_llamahub/tests/requirements.txt b/components/load_with_llamahub/tests/requirements.txt
deleted file mode 100644
index 2a929edcc..000000000
--- a/components/load_with_llamahub/tests/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-pytest==7.4.2
diff --git a/components/normalize_text/Dockerfile b/components/normalize_text/Dockerfile
deleted file mode 100644
index c1e64c082..000000000
--- a/components/normalize_text/Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-FROM --platform=linux/amd64 python:3.8-slim as base
-
-# System dependencies
-RUN apt-get update && \
- apt-get upgrade -y && \
- apt-get install git -y
-
-# Install requirements
-COPY requirements.txt /
-RUN pip3 install --no-cache-dir -r requirements.txt
-
-# Install Fondant
-# This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
-RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
-
-# Set the working directory to the component folder
-WORKDIR /component
-COPY src/ src/
-
-FROM base as test
-COPY tests/ tests/
-RUN pip3 install --no-cache-dir -r tests/requirements.txt
-RUN python -m pytest tests
-
-FROM base
-WORKDIR /component/src
-ENTRYPOINT ["fondant", "execute", "main"]
diff --git a/components/normalize_text/README.md b/components/normalize_text/README.md
deleted file mode 100644
index 9de782516..000000000
--- a/components/normalize_text/README.md
+++ /dev/null
@@ -1,84 +0,0 @@
-# Normalize text
-
-
-## Description
-This component implements several text normalization techniques to clean and preprocess textual
-data:
-
-- Apply lowercasing: Converts all text to lowercase
-- Remove unnecessary whitespaces: Eliminates extra spaces between words, e.g. tabs
-- Apply NFC normalization: Converts characters to their canonical representation
-- Remove commonly seen patterns in webpages, following the implementation of
- [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf)
-- Remove punctuation: Strips punctuation marks from the text
-
-These text normalization techniques are valuable for preparing text data before using it for
-the training of large language models.
-
-
-
-## Inputs / outputs
-
-
-### Consumes
-**This component consumes:**
-
-- text: string
-
-
-
-
-
-### Produces
-**This component produces:**
-
-- text: string
-
-
-
-
-## Arguments
-
-The component takes the following arguments to alter its behavior:
-
-| argument | type | description | default |
-| -------- | ---- | ----------- | ------- |
-| remove_additional_whitespaces | bool | If true, remove all additional whitespaces and tabs. | / |
-| apply_nfc | bool | If true, apply NFC normalization | / |
-| normalize_lines | bool | If true, analyze documents line-by-line and apply various rules to discard or edit lines. Used to remove common patterns in webpages, e.g. counters | / |
-| do_lowercase | bool | If true, apply lowercasing | / |
-| remove_punctuation | bool | If true, punctuation will be removed | / |
-
-
-## Usage
-
-You can add this component to your pipeline using the following code:
-
-```python
-from fondant.pipeline import Pipeline
-
-
-pipeline = Pipeline(...)
-
-dataset = pipeline.read(...)
-
-dataset = dataset.apply(
- "normalize_text",
- arguments={
- # Add arguments
- # "remove_additional_whitespaces": False,
- # "apply_nfc": False,
- # "normalize_lines": False,
- # "do_lowercase": False,
- # "remove_punctuation": False,
- },
-)
-```
-
-
-## Testing
-
-You can run the tests using docker with BuildKit. From this directory, run:
-```
-docker build . --target test
-```
diff --git a/components/normalize_text/fondant_component.yaml b/components/normalize_text/fondant_component.yaml
deleted file mode 100644
index 35b6c79de..000000000
--- a/components/normalize_text/fondant_component.yaml
+++ /dev/null
@@ -1,42 +0,0 @@
-name: Normalize text
-image: fndnt/normalize_text:latest
-description: |
- This component implements several text normalization techniques to clean and preprocess textual
- data:
-
- - Apply lowercasing: Converts all text to lowercase
- - Remove unnecessary whitespaces: Eliminates extra spaces between words, e.g. tabs
- - Apply NFC normalization: Converts characters to their canonical representation
- - Remove commonly seen patterns in webpages, following the implementation of
- [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf)
- - Remove punctuation: Strips punctuation marks from the text
-
- These text normalization techniques are valuable for preparing text data before using it for
- the training of large language models.
-tags:
- - Text processing
-
-consumes:
- text:
- type: string
-
-produces:
- text:
- type: string
-
-args:
- remove_additional_whitespaces:
- description: If true, remove all additional whitespaces and tabs.
- type: bool
- apply_nfc:
- description: If true, apply NFC normalization
- type: bool
- normalize_lines:
- description: If true, analyze documents line-by-line and apply various rules to discard or edit lines. Used to remove common patterns in webpages, e.g. counters
- type: bool
- do_lowercase:
- description: If true, apply lowercasing
- type: bool
- remove_punctuation:
- description: If true, punctuation will be removed
- type: bool
\ No newline at end of file
diff --git a/components/normalize_text/requirements.txt b/components/normalize_text/requirements.txt
deleted file mode 100644
index a4299def8..000000000
--- a/components/normalize_text/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-ftfy==6.1.1
\ No newline at end of file
diff --git a/components/normalize_text/src/main.py b/components/normalize_text/src/main.py
deleted file mode 100644
index cda2939cf..000000000
--- a/components/normalize_text/src/main.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""A component that normalizes text."""
-import logging
-import re
-import string
-from typing import List
-
-import ftfy
-import pandas as pd
-from fondant.component import PandasTransformComponent
-from utils import is_counter, is_one_word, mainly_uppercase, only_numerical
-
-logger = logging.getLogger(__name__)
-
-
-def _remove_punctuation(text):
- """Remove punctuation in given text."""
- return text.translate(str.maketrans("", "", string.punctuation))
-
-
-def _remove_additional_whitespaces(text):
- """
- Text cleaning method from slimpajama approach.
- https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/preprocessing/filter.py
- Apply remove punctuation, and remove consecutive spaces, newlines, tabs in the middle
- and in the beginning / end.
- """
- return re.sub(r"\s+", " ", text.strip())
-
-
-def normalize_lines(text):
- def any_condition_met(line, discard_condition_functions):
- return any(condition(line) for condition in discard_condition_functions)
-
- discard_conditions = [mainly_uppercase, only_numerical, is_counter, is_one_word]
- return " ".join(
- [
- line
- for line in text.split("\n")
- if not any_condition_met(line, discard_conditions)
- ],
- )
-
-
-class NormalizeTextComponent(PandasTransformComponent):
- """Component that normalizes text."""
-
- def __init__(
- self,
- *,
- remove_additional_whitespaces: bool,
- apply_nfc: bool,
- normalize_lines: bool,
- do_lowercase: bool,
- remove_punctuation: bool,
- ):
- self.remove_additional_whitespaces = remove_additional_whitespaces
- self.apply_nfc = apply_nfc
- self.normalize_lines = normalize_lines
- self.do_lowercase = do_lowercase
- self.remove_punctuation = remove_punctuation
-
- @staticmethod
- def _do_nfc_normalization(text: str):
- """Apply nfc normalization to the text of the dataframe."""
- return ftfy.fix_text(text, normalization="NFC")
-
- @staticmethod
- def _remove_patterns(regex_patterns: List[str], text: str):
- """Remove each regex pattern in the provided string."""
- for pattern in regex_patterns:
- text = re.sub(pattern, "", text)
- return text
-
- def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
- """
- Apply normalization transformations. The component is capable of:
- - NFC normalization
- - Lowercasing
- - Removing of unnecessary whitespaces (e.g. tabs), punctuation
- - Apply line-wise transformations that exclude lines matching specified patterns.
- Patterns include lines that are mainly composed of uppercase characters, lines that consist
- only of numerical characters, lines that are counters (e.g., "3 likes"), and lines
- that contain only one word.
-
- Args:
- dataframe: Pandas dataframe.
-
- Returns:
- Pandas dataframe
- """
- if self.normalize_lines:
- dataframe["text"] = dataframe["text"].apply(
- normalize_lines,
- )
-
- if self.do_lowercase:
- dataframe["text"] = dataframe["text"].apply(
- lambda x: x.lower(),
- )
-
- if self.apply_nfc:
- dataframe["text"] = dataframe["text"].apply(
- self._do_nfc_normalization,
- )
-
- if self.remove_punctuation:
- dataframe["text"] = dataframe["text"].apply(
- _remove_punctuation,
- )
-
- if self.remove_additional_whitespaces:
- dataframe["text"] = dataframe["text"].apply(
- _remove_additional_whitespaces,
- )
-
- # remove all empty rows
- dataframe = dataframe[dataframe["text"].astype(bool)]
-
- return dataframe
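
A quick way to see the combined effect of these transformations is to run the component directly on a small dataframe. This is a sketch; it assumes the `src/` directory is on the Python path so that `main` and `utils` can be imported, as in the component's tests.

```python
import pandas as pd
from main import NormalizeTextComponent

component = NormalizeTextComponent(
    remove_additional_whitespaces=True,
    apply_nfc=True,
    normalize_lines=True,
    do_lowercase=True,
    remove_punctuation=True,
)

# The counter line ("3 likes") and the one-word line ("word") are discarded;
# the remaining text is lowercased and stripped of punctuation and extra whitespace.
df = pd.DataFrame({"text": ["  Hello,   WORLD!\n3 likes\nword"]})
print(component.transform(df)["text"].tolist())  # ['hello world']
```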
diff --git a/components/normalize_text/src/utils.py b/components/normalize_text/src/utils.py
deleted file mode 100644
index b487bc61e..000000000
--- a/components/normalize_text/src/utils.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import re
-
-
-def mainly_uppercase(line: str, threshold: float = 0.7) -> bool:
- """
- Checks if a line is mainly composed of uppercase characters.
-
- Args:
- line: The input line to check.
- threshold (float): The threshold (between 0 and 1) to determine what is considered
- "mainly uppercase."
-
- Returns:
- bool: True if the line is mainly uppercase, False otherwise.
- """
- uppercase_count = sum(1 for char in line if char.isupper())
- total_chars = len(line)
- if total_chars == 0:
- return False
-
- uppercase_ratio = uppercase_count / total_chars
- return uppercase_ratio >= threshold
-
-
-def only_numerical(line: str) -> bool:
- """
- Checks if a line is composed only of numerical characters.
-
- Args:
- line: The input line to check.
-
- Returns:
- bool: True if the line is only composed of numerical characters, False otherwise.
- """
- return line.isdigit()
-
-
-def is_counter(line: str) -> bool:
- """
- Checks if a line represents a counter (e.g., "3 likes").
-
- Args:
- line: The input line to check.
-
- Returns:
- bool: True if the line represents a counter, False otherwise.
- """
- # Use regular expression to check for the pattern:
- line = line.strip()
- pattern = r"^\d+\s+\S+$"
- return re.match(pattern, line) is not None
-
-
-def is_one_word(line: str) -> bool:
- """
- Checks if a line contains only one word.
-
- Args:
- line: The input line to check.
-
- Returns:
- bool: True if the line contains only one word, False otherwise.
- """
- words = line.split()
- return len(words) == 1
diff --git a/components/normalize_text/tests/component_test.py b/components/normalize_text/tests/component_test.py
deleted file mode 100644
index d7f52967e..000000000
--- a/components/normalize_text/tests/component_test.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import pandas as pd
-
-from src.main import NormalizeTextComponent
-
-
-def test_transform_custom_component():
- """Test components transform method."""
- user_arguments = {
- "remove_additional_whitespaces": True,
- "apply_nfc": True,
- "normalize_lines": True,
- "do_lowercase": True,
- "remove_punctuation": True,
- }
- component = NormalizeTextComponent(**user_arguments)
-
- input_dataframe = pd.DataFrame(
- [
- "\u0043\u0327 something",
- "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
- "Nulla facilisi. Sed eu nulla sit amet enim scelerisque dapibus.",
- ],
- columns=["text"],
- )
-
- expected_output = pd.DataFrame(
- [
- "\u00e7 something",
- "lorem ipsum dolor sit amet consectetur adipiscing elit",
- "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus",
- ],
- columns=["text"],
- )
-
- output_dataframe = component.transform(input_dataframe)
-
- pd.testing.assert_frame_equal(
- left=expected_output,
- right=output_dataframe,
- check_dtype=False,
- )
diff --git a/components/normalize_text/tests/pytest.ini b/components/normalize_text/tests/pytest.ini
deleted file mode 100644
index bf6a8a517..000000000
--- a/components/normalize_text/tests/pytest.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-[pytest]
-pythonpath = ../src
\ No newline at end of file
diff --git a/components/normalize_text/tests/requirements.txt b/components/normalize_text/tests/requirements.txt
deleted file mode 100644
index 6950eb5a7..000000000
--- a/components/normalize_text/tests/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-pytest==7.4.0
\ No newline at end of file
diff --git a/components/normalize_text/tests/utils_test.py b/components/normalize_text/tests/utils_test.py
deleted file mode 100644
index 8a0892bb2..000000000
--- a/components/normalize_text/tests/utils_test.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from src.utils import (
- is_counter,
- is_one_word,
- mainly_uppercase,
- only_numerical,
-)
-
-
-def test_mainly_uppercase():
- line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE"
- assert mainly_uppercase(line, threshold=0.5)
-
-
-def test_mainly_uppercase_under_threshold():
- line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE"
- assert not mainly_uppercase(line, threshold=0.9)
-
-
-def test_only_numerical():
- line = "42"
- assert only_numerical(line)
-
-
-def test_only_numerical_on_words():
- line = "42 lorem ipsum"
- assert not only_numerical(line)
-
-
-def test_is_counter():
- line = "13 Likes"
- assert is_counter(line)
-
-
-def test_is_not_counter():
- line = "Hello world! 42 people are part of .."
- assert not is_counter(line)
-
-
-def test_is_one_word():
- line = "word"
- assert is_one_word(line)
-
-
-def test_is_not_one_word():
- line = "two words"
- assert not is_one_word(line)
diff --git a/components/retrieve_from_weaviate/Dockerfile b/components/retrieve_from_weaviate/Dockerfile
deleted file mode 100644
index 5de6e945f..000000000
--- a/components/retrieve_from_weaviate/Dockerfile
+++ /dev/null
@@ -1,29 +0,0 @@
-FROM --platform=linux/amd64 python:3.8-slim as base
-
-# System dependencies
-RUN apt-get update && \
- apt-get upgrade -y && \
- apt-get install git -y
-
-# Install requirements
-COPY requirements.txt /
-RUN pip3 install --no-cache-dir -r requirements.txt
-
-# Install Fondant
-# This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
-RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
-
-# Set the working directory to the component folder
-WORKDIR /component
-COPY src/ src/
-
-FROM base as test
-COPY tests/ tests/
-RUN pip3 install --no-cache-dir -r tests/requirements.txt
-RUN python -m pytest tests
-
-FROM base
-WORKDIR /component/src
-ENTRYPOINT ["fondant", "execute", "main"]
-
diff --git a/components/retrieve_from_weaviate/README.md b/components/retrieve_from_weaviate/README.md
deleted file mode 100644
index e5c3337ce..000000000
--- a/components/retrieve_from_weaviate/README.md
+++ /dev/null
@@ -1,206 +0,0 @@
-# retrieve_from_weaviate
-
-
-## Description
-Component that retrieves chunks from a Weaviate vector database.
-The component can retrieve chunks based on a text search or based on a vector search.
-Reranking is only supported for text search.
-More info here: [Cohere Ranking](https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb)
-and [Weaviate Search Rerank](https://weaviate.io/developers/weaviate/search/rerank).
-
-### Running with text as input
-
-```python
-import pyarrow as pa
-from fondant.pipeline import Pipeline
-
-pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline")
-
-dataset = pipeline.read(
- "load_from_csv",
- arguments={
- "dataset_uri": "path/to/dataset.csv",
- },
- produces={
- "text": pa.string(),
- }
-)
-
-dataset = dataset.apply(
- "index_weaviate",
- arguments={
- "weaviate_url": "http://localhost:8080",
- "class_name": "my_class",
- "vectorizer": "text2vec-openai",
- "additional_headers": {
- "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY"
- }
- },
- consumes={
- "text": "text"
- }
-)
-
-dataset = dataset.apply(
- "retrieve_from_weaviate",
- arguments={
- "weaviate_url": "http://localhost:8080",
- "class_name": "my_class",
- "top_k": 3,
- "additional_headers": {
- "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY"
- }
- },
- consumes={
- "text": "text"
- }
-)
-```
-
-```python
-import pyarrow as pa
-from fondant.pipeline import Pipeline
-
-pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline")
-
-dataset = pipeline.read(
- "load_from_csv",
- arguments={
- "dataset_uri": "path/to/dataset.csv",
- },
- produces={
- "text": pa.string(),
- }
-)
-
-dataset = dataset.apply(
- "embed_text",
- arguments={...},
- consumes={
- "text": "text",
- },
-)
-
-dataset = dataset.apply(
- "index_weaviate",
- arguments={
- "weaviate_url": "http://localhost:8080",
- "class_name": "my_class",
- },
- consumes={
- "embedding": "embedding"
- }
-)
-
-dataset = pipeline.read(
- "load_from_csv",
- arguments={
- "dataset_uri": "path/to/prompt_dataset.csv",
- },
- produces={
- "prompts": pa.string(),
- }
-)
-
-dataset = dataset.apply(
- "embed_text",
- arguments={...},
- consumes={
- "prompts": "text",
- },
-)
-
-dataset = dataset.apply(
- "retrieve_from_weaviate",
- arguments={
- "weaviate_url": "http://localhost:8080",
- "class_name": "my_class",
- "top_k": 3,
- },
- consumes={
- "embedding": "embedding"
- }
-)
-```
-
-
-
-## Inputs / outputs
-
-
-### Consumes
-
-**This component can consume additional fields**
-- <field_name>: <dataset_field_name>
-This defines a mapping to update the fields consumed by the operation as defined in the component spec.
-The keys are the names of the fields to be received by the component, while the values are
-the name of the field to map from the input dataset
-
-See the usage example below on how to define a field name for additional fields.
-
-
-
-
-
-### Produces
-**This component produces:**
-
-- retrieved_chunks: list
-
-
-
-
-## Arguments
-
-The component takes the following arguments to alter its behavior:
-
-| argument | type | description | default |
-| -------- | ---- | ----------- | ------- |
-| weaviate_url | str | The URL of the weaviate instance. | http://localhost:8080 |
-| class_name | str | The name of the weaviate class that will be queried | / |
-| top_k | int | Number of chunks to retrieve | / |
-| additional_config | dict | Additional configuration to pass to the weaviate client. | / |
-| additional_headers | dict | Additional headers to pass to the weaviate client. | / |
-| hybrid_query | str | The hybrid query to be used for retrieval. Optional parameter. | / |
-| hybrid_alpha | float | Argument to change how much each search affects the results. An alpha of 1 is a pure vector search. An alpha of 0 is a pure keyword search. | / |
-| rerank | bool | Whether to rerank the results based on the hybrid query. Defaults to False. Check these resources for more information on reranking: https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb and https://weaviate.io/developers/weaviate/search/rerank. | / |
-
-
-## Usage
-
-You can add this component to your pipeline using the following code:
-
-```python
-from fondant.pipeline import Pipeline
-
-
-pipeline = Pipeline(...)
-
-dataset = pipeline.read(...)
-
-dataset = dataset.apply(
- "retrieve_from_weaviate",
- arguments={
- # Add arguments
- # "weaviate_url": "http://localhost:8080",
- # "class_name": ,
- # "top_k": 0,
- # "additional_config": {},
- # "additional_headers": {},
- # "hybrid_query": ,
- # "hybrid_alpha": 0.0,
- # "rerank": False,
- },
- consumes={
- <field_name>: <dataset_field_name>,
- ..., # Add fields
- },
-)
-```
-
-
-## Testing
-
-You can run the tests using docker with BuildKit. From this directory, run:
-```
-docker build . --target test
-```
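
Since reranking is only supported for text search, a prompt-based retrieval with reranking enabled could look like the sketch below. The field mapping, hybrid parameters, and the Cohere header are assumptions; the Weaviate class must be configured with a vectorizer and a reranker module.

```python
from fondant.pipeline import Pipeline

pipeline = Pipeline(...)

dataset = pipeline.read(...)  # assumed to produce a "prompts" field

dataset = dataset.apply(
    "retrieve_from_weaviate",
    arguments={
        "weaviate_url": "http://localhost:8080",
        "class_name": "my_class",
        "top_k": 3,
        "hybrid_query": "my query",
        "hybrid_alpha": 0.5,
        "rerank": True,
        "additional_headers": {
            "X-Cohere-Api-Key": "YOUR-COHERE-API-KEY"
        },
    },
    consumes={
        # Map the dataset's "prompts" field to the "prompt" column the component expects
        "prompt": "prompts"
    },
)
```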
diff --git a/components/retrieve_from_weaviate/fondant_component.yaml b/components/retrieve_from_weaviate/fondant_component.yaml
deleted file mode 100644
index 7b082b37b..000000000
--- a/components/retrieve_from_weaviate/fondant_component.yaml
+++ /dev/null
@@ -1,175 +0,0 @@
-name: retrieve_from_weaviate
-description: |
- Component that retrieves chunks from a Weaviate vector database.
- The component can retrieve chunks based on a text search or based on a vector search.
- Reranking is only supported for text search.
- More info here: [Cohere Ranking](https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb)
- and [Weaviate Search Rerank](https://weaviate.io/developers/weaviate/search/rerank).
-
- ### Running with text as input
-
- ```python
- import pyarrow as pa
- from fondant.pipeline import Pipeline
-
- pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline")
-
- dataset = pipeline.read(
- "load_from_csv",
- arguments={
- "dataset_uri": "path/to/dataset.csv",
- },
- produces={
- "text": pa.string(),
- }
- )
-
- dataset = dataset.apply(
- "index_weaviate",
- arguments={
- "weaviate_url": "http://localhost:8080",
- "class_name": "my_class",
- "vectorizer": "text2vec-openai",
- "additional_headers": {
- "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY"
- }
- },
- consumes={
- "text": "text"
- }
- )
-
- dataset = dataset.apply(
- "retrieve_from_weaviate",
- arguments={
- "weaviate_url": "http://localhost:8080",
- "class_name": "my_class",
- "top_k": 3,
- "additional_headers": {
- "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY"
- }
- },
- consumes={
- "text": "text"
- }
- )
- ```
-
- ```python
- import pyarrow as pa
- from fondant.pipeline import Pipeline
-
- pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline")
-
- dataset = pipeline.read(
- "load_from_csv",
- arguments={
- "dataset_uri": "path/to/dataset.csv",
- },
- produces={
- "text": pa.string(),
- }
- )
-
- dataset = dataset.apply(
- "embed_text",
- arguments={...},
- consumes={
- "text": "text",
- },
- )
-
- dataset = dataset.apply(
- "index_weaviate",
- arguments={
- "weaviate_url": "http://localhost:8080",
- "class_name": "my_class",
- },
- consumes={
- "embedding": "embedding"
- }
- )
-
- dataset = pipeline.read(
- "load_from_csv",
- arguments={
- "dataset_uri": "path/to/prompt_dataset.csv",
- },
- produces={
- "prompts": pa.string(),
- }
- )
-
- dataset = dataset.apply(
- "embed_text",
- arguments={...},
- consumes={
- "prompts": "text",
- },
- )
-
- dataset = dataset.apply(
- "retrieve_from_weaviate",
- arguments={
- "weaviate_url": "http://localhost:8080",
- "class_name": "my_class",
- "top_k": 3,
- },
- consumes={
- "embedding": "embedding"
- }
- )
- ```
-
-image: fndnt/retrieve_from_weaviate:dev
-tags:
- - Data retrieval
-
-consumes:
- additionalProperties: true
-
-produces:
- retrieved_chunks:
- type: array
- items:
- type: string
-
-args:
- weaviate_url:
- description: The URL of the weaviate instance.
- type: str
- default: http://localhost:8080
- class_name:
- description:
- The name of the weaviate class that will be queried
- type: str
- top_k:
- description: Number of chunks to retrieve
- type: int
- additional_config:
- description: |
- Additional configuration to pass to the weaviate client.
- type: dict
- default: {}
- additional_headers:
- description: |
- Additional headers to pass to the weaviate client.
- type: dict
- default: {}
- hybrid_query:
- description: |
- The hybrid query to be used for retrieval. Optional parameter.
- type: str
- default: None
- hybrid_alpha:
- description: |
- Argument to change how much each search affects the results. An alpha of 1 is a pure vector search. An alpha of 0 is a pure keyword search.
- type: float
- default: None
- rerank:
- description: |
- Whether to rerank the results based on the hybrid query. Defaults to False.
- Check this notebook for more information on reranking:
- https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb
- https://weaviate.io/developers/weaviate/search/rerank.
- type: bool
- default: False
\ No newline at end of file
diff --git a/components/retrieve_from_weaviate/requirements.txt b/components/retrieve_from_weaviate/requirements.txt
deleted file mode 100644
index 12e81349f..000000000
--- a/components/retrieve_from_weaviate/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-weaviate-client==3.24.1
\ No newline at end of file
diff --git a/components/retrieve_from_weaviate/src/main.py b/components/retrieve_from_weaviate/src/main.py
deleted file mode 100644
index f69f2e684..000000000
--- a/components/retrieve_from_weaviate/src/main.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import typing as t
-
-import dask.dataframe as dd
-import pandas as pd
-import weaviate
-from fondant.component import PandasTransformComponent
-
-
-class RetrieveFromWeaviateComponent(PandasTransformComponent):
- def __init__(
- self,
- *,
- weaviate_url: str,
- class_name: str,
- top_k: int,
- additional_config: t.Optional[dict],
- additional_headers: t.Optional[dict],
- hybrid_query: t.Optional[str],
- hybrid_alpha: t.Optional[float],
- rerank: bool,
- ) -> None:
- """
- Args:
- weaviate_url: The URL of the weaviate instance.
- class_name: Name of class to query
- top_k: Amount of context to return.
- additional_config: Additional configuration passed to the weaviate client.
- additional_headers: Additional headers passed to the weaviate client.
- hybrid_query: The hybrid query to be used for retrieval. Optional parameter.
- hybrid_alpha: Argument to change how much each search affects the results. An alpha
- of 1 is a pure vector search. An alpha of 0 is a pure keyword search.
- rerank: Whether to rerank the results based on the hybrid query. Defaults to False.
- Check this notebook for more information on reranking:
- https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb
- https://weaviate.io/developers/weaviate/search/rerank.
- """
- # Initialize your component here based on the arguments
- self.client = weaviate.Client(
- url=weaviate_url,
- additional_config=additional_config if additional_config else None,
- additional_headers=additional_headers if additional_headers else None,
- )
- self.class_name = class_name
- self.k = top_k
- self.hybrid_query, self.hybrid_alpha = self.validate_hybrid_query(
- hybrid_query,
- hybrid_alpha,
- )
- self.rerank = rerank
-
- @staticmethod
- def validate_hybrid_query(
- hybrid_query: t.Optional[str],
- hybrid_alpha: t.Optional[float],
- ):
- if hybrid_query is not None and hybrid_alpha is None:
- msg = (
- "If hybrid_query is specified, hybrid_alpha must be specified as well."
- )
- raise ValueError(
- msg,
- )
-
- return hybrid_query, hybrid_alpha
-
- def validate_reranker(self, dataframe: dd.DataFrame) -> None:
- if self.rerank and "prompt" not in dataframe.columns:
- msg = (
- "If rerank is specified, the dataframe must contain a 'prompt' column. Reranking is"
- " only supported for text data and not for embeddings."
- )
- raise ValueError(
- msg,
- )
-
- def teardown(self) -> None:
- del self.client
-
- def retrieve_chunks_from_embeddings(self, vector_query: list):
- """Get results from weaviate database."""
- query = (
- self.client.query.get(self.class_name, ["passage"])
- .with_near_vector({"vector": vector_query})
- .with_limit(self.k)
- .with_additional(["distance"])
- )
- if self.hybrid_query is not None:
- query = query.with_hybrid(query=self.hybrid_query, alpha=self.hybrid_alpha)
-
- result = query.do()
-
- result_dict = result["data"]["Get"][self.class_name]
- return [retrieved_chunk["passage"] for retrieved_chunk in result_dict]
-
- def retrieve_chunks_from_prompts(self, prompt: str):
- """Get results from weaviate database."""
- query = (
- self.client.query.get(self.class_name, ["passage"])
- .with_near_text({"concepts": [prompt]})
- .with_limit(self.k)
- )
- if self.hybrid_query is not None:
- query = query.with_hybrid(query=self.hybrid_query, alpha=self.hybrid_alpha)
-
- if self.rerank:
- query = query.with_additional(
- f'rerank(property: "passage" query: "{prompt}") {{ score }}',
- )
-
- result = query.do()
-
- result_dict = result["data"]["Get"][self.class_name]
- return [retrieved_chunk["passage"] for retrieved_chunk in result_dict]
-
- def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
- self.validate_reranker(dataframe)
-
- if "embedding" in dataframe.columns:
- dataframe["retrieved_chunks"] = dataframe["embedding"].apply(
- self.retrieve_chunks_from_embeddings,
- )
-
- elif "prompt" in dataframe.columns:
- dataframe["retrieved_chunks"] = dataframe["prompt"].apply(
- self.retrieve_chunks_from_prompts,
- )
- else:
- msg = "Dataframe must contain either an 'embedding' column or a 'prompt' column."
- raise ValueError(
- msg,
- )
-
- return dataframe
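
For comparison, the embedding-based retrieval above corresponds roughly to this raw `weaviate-client` (3.x) query, sketched here assuming a running Weaviate instance with a "MyClass" class exposing a "passage" property:

```python
import weaviate

client = weaviate.Client(url="http://localhost:8080")

result = (
    client.query.get("MyClass", ["passage"])
    .with_near_vector({"vector": [0.1, 0.2, 0.3]})  # query embedding
    .with_limit(3)
    .with_additional(["distance"])
    .do()
)

# Same unpacking as retrieve_chunks_from_embeddings
chunks = [obj["passage"] for obj in result["data"]["Get"]["MyClass"]]
print(chunks)
```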
diff --git a/components/retrieve_from_weaviate/tests/pytest.ini b/components/retrieve_from_weaviate/tests/pytest.ini
deleted file mode 100644
index bf6a8a517..000000000
--- a/components/retrieve_from_weaviate/tests/pytest.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-[pytest]
-pythonpath = ../src
\ No newline at end of file
diff --git a/components/retrieve_from_weaviate/tests/requirements.txt b/components/retrieve_from_weaviate/tests/requirements.txt
deleted file mode 100644
index 2a929edcc..000000000
--- a/components/retrieve_from_weaviate/tests/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-pytest==7.4.2
diff --git a/components/retrieve_from_weaviate/tests/test_component.py b/components/retrieve_from_weaviate/tests/test_component.py
deleted file mode 100644
index 7b30898f1..000000000
--- a/components/retrieve_from_weaviate/tests/test_component.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import tempfile
-
-import numpy as np
-import pandas as pd
-import weaviate
-from weaviate.embedded import EmbeddedOptions
-
-from src.main import RetrieveFromWeaviateComponent
-
-
-def set_up_instance(client):
- """Set up an embedded instance using the provided client."""
- data = [
- {
- "data_object": {
- "passage": "foo",
- },
- "vector": np.array([1.0, 2.0]),
- },
- {
- "data_object": {
- "passage": "bar",
- },
- "vector": np.array([2.0, 3.0]),
- },
- ]
-
- for entry in data:
- client.data_object.create(
- entry["data_object"],
- class_name="Test",
- vector=entry["vector"],
- )
-
- return "http://localhost:6666"
-
-
-def test_component():
- input_dataframe = pd.DataFrame.from_dict(
- {
- "id": ["1", "2"],
- "embedding": [np.array([1.0, 2.0]), np.array([2.0, 3.0])],
- },
- )
- input_dataframe = input_dataframe.set_index("id")
-
- expected_output_dataframe = pd.DataFrame.from_dict(
- {
- "id": ["1", "2"],
- "retrieved_chunks": [["foo", "bar"], ["bar", "foo"]],
- },
- )
- expected_output_dataframe = expected_output_dataframe.set_index("id")
-
- with tempfile.TemporaryDirectory() as tmpdir:
- client = weaviate.Client(
- embedded_options=EmbeddedOptions(
- persistence_data_path=tmpdir,
- ),
- )
- url = set_up_instance(client)
-
- component = RetrieveFromWeaviateComponent(
- weaviate_url=url,
- class_name="Test",
- top_k=2,
- additional_config={},
- additional_headers={},
- hybrid_query=None,
- hybrid_alpha=None,
- rerank=False,
- )
-
- output_dataframe = component.transform(input_dataframe)
-
- pd.testing.assert_frame_equal(
- left=expected_output_dataframe,
- right=output_dataframe["retrieved_chunks"].to_frame(),
- check_dtype=False,
- )