diff --git a/components/evaluate_ragas/Dockerfile b/components/evaluate_ragas/Dockerfile deleted file mode 100644 index a5c3fa17a..000000000 --- a/components/evaluate_ragas/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM --platform=linux/amd64 python:3.8-slim as base - -# System dependencies -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install git -y - -# Install requirements -COPY requirements.txt / -RUN pip3 install --no-cache-dir -r requirements.txt - -# Install Fondant -# This is split from other requirements to leverage caching -ARG FONDANT_VERSION=main -RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} - -# Set the working directory to the component folder -WORKDIR /component -COPY src/ src/ - -FROM base as test -COPY tests/ tests/ -RUN pip3 install --no-cache-dir -r tests/requirements.txt -ARG OPENAI_KEY -ENV OPENAI_KEY=${OPENAI_KEY} -RUN python -m pytest tests - -FROM base -WORKDIR /component/src -ENTRYPOINT ["fondant", "execute", "main"] diff --git a/components/evaluate_ragas/README.md b/components/evaluate_ragas/README.md deleted file mode 100644 index fbaf13a3e..000000000 --- a/components/evaluate_ragas/README.md +++ /dev/null @@ -1,75 +0,0 @@ -# Evaluate ragas - - -## Description -Component that evaluates the retriever using RAGAS - - -## Inputs / outputs - - -### Consumes -**This component consumes:** - -- question: string -- retrieved_chunks: list - - - - - -### Produces - -**This component can produce additional fields** -- : -This defines a mapping to update the fields produced by the operation as defined in the component spec. -The keys are the names of the fields to be produced by the component, while the values are -the type of the field that should be used to write the output dataset. - - - -## Arguments - -The component takes the following arguments to alter its behavior: - -| argument | type | description | default | -| -------- | ---- | ----------- | ------- | -| llm_module_name | str | Module from which the LLM is imported. Defaults to langchain.llms | langchain.chat_models | -| llm_class_name | str | Name of the selected llm | ChatOpenAI | -| llm_kwargs | dict | Arguments of the selected llm | {'model_name': 'gpt-3.5-turbo'} | - - -## Usage - -You can add this component to your pipeline using the following code: - -```python -from fondant.pipeline import Pipeline - - -pipeline = Pipeline(...) - -dataset = pipeline.read(...) - -dataset = dataset.apply( - "evaluate_ragas", - arguments={ - # Add arguments - # "llm_module_name": "langchain.chat_models", - # "llm_class_name": "ChatOpenAI", - # "llm_kwargs": {'model_name': 'gpt-3.5-turbo'}, - }, - produces={ - : , - ..., # Add fields - }, -) -``` - - -## Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . 
--target test -``` diff --git a/components/evaluate_ragas/fondant_component.yaml b/components/evaluate_ragas/fondant_component.yaml deleted file mode 100644 index bca9d404b..000000000 --- a/components/evaluate_ragas/fondant_component.yaml +++ /dev/null @@ -1,33 +0,0 @@ -name: Evaluate ragas -description: Component that evaluates the retriever using RAGAS -image: fndnt/evaluate_ragas:dev -tags: - - Text processing - -consumes: - question: - type: string - retrieved_chunks: - type: array - items: - type: string - -produces: - additionalProperties: true - # Overwrite with metrics to be computed by ragas - # (https://docs.ragas.io/en/latest/concepts/metrics/index.html) - - -args: - llm_module_name: - description: Module from which the LLM is imported. Defaults to langchain.llms - type: str - default: "langchain.chat_models" - llm_class_name: - description: Name of the selected llm - type: str - default: "ChatOpenAI" - llm_kwargs: - description: Arguments of the selected llm - type: dict - default: {"model_name":"gpt-3.5-turbo"} diff --git a/components/evaluate_ragas/requirements.txt b/components/evaluate_ragas/requirements.txt deleted file mode 100644 index 64c1d6905..000000000 --- a/components/evaluate_ragas/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -ragas==0.0.21 \ No newline at end of file diff --git a/components/evaluate_ragas/src/main.py b/components/evaluate_ragas/src/main.py deleted file mode 100644 index b37e873b2..000000000 --- a/components/evaluate_ragas/src/main.py +++ /dev/null @@ -1,83 +0,0 @@ -import typing as t - -import pandas as pd -from datasets import Dataset -from fondant.component import PandasTransformComponent -from ragas import evaluate -from ragas.llms import LangchainLLM - - -class RetrieverEval(PandasTransformComponent): - def __init__( - self, - *, - llm_module_name: str, - llm_class_name: str, - llm_kwargs: dict, - produces: t.Dict[str, t.Any], - ) -> None: - """ - Args: - llm_module_name: Module from which the LLM is imported. Defaults to - langchain.chat_models - llm_class_name: Name of the selected llm. Defaults to ChatOpenAI - llm_kwargs: Arguments of the selected llm - produces: RAGAS metrics to compute. 
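
A note on how these constructor arguments are wired together: the component resolves the LLM class from `llm_module_name`/`llm_class_name` and looks each key of `produces` up in `ragas.metrics` at runtime. A minimal sketch of that dynamic-lookup pattern, using only the defaults from the component spec (actually running it assumes langchain, ragas 0.0.21 and an OpenAI API key are available):

```python
# Sketch of the dynamic lookup used by this component (see import_from below).
def import_from(module_name: str, element_name: str):
    module = __import__(module_name, fromlist=[element_name])
    return getattr(module, element_name)

# Defaults from the component spec.
llm_cls = import_from("langchain.chat_models", "ChatOpenAI")
llm = llm_cls(model_name="gpt-3.5-turbo")

# Metrics are resolved from the keys of `produces`, e.g. "context_precision".
context_precision = import_from("ragas.metrics", "context_precision")
```
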
- """ - self.llm = self.extract_llm( - llm_module_name=llm_module_name, - llm_class_name=llm_class_name, - llm_kwargs=llm_kwargs, - ) - self.gpt_wrapper = LangchainLLM(llm=self.llm) - self.metric_functions = self.extract_metric_functions( - metrics=list(produces.keys()), - ) - self.set_llm(self.metric_functions) - - # import the metric functions selected - @staticmethod - def import_from(module_name: str, element_name: str): - module = __import__(module_name, fromlist=[element_name]) - return getattr(module, element_name) - - def extract_llm(self, llm_module_name: str, llm_class_name: str, llm_kwargs: dict): - module = self.import_from( - module_name=llm_module_name, - element_name=llm_class_name, - ) - return module(**llm_kwargs) - - def extract_metric_functions(self, metrics: list): - functions = [] - for metric in metrics: - functions.append(self.import_from("ragas.metrics", metric)) - return functions - - def set_llm(self, metric_functions: list): - for metric_function in metric_functions: - metric_function.llm = self.gpt_wrapper - - # evaluate the retriever - @staticmethod - def create_hf_ds(dataframe: pd.DataFrame): - dataframe = dataframe.rename( - columns={"retrieved_chunks": "contexts"}, - ) - return Dataset.from_pandas(dataframe) - - def ragas_eval(self, dataset): - return evaluate(dataset=dataset, metrics=self.metric_functions) - - def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - hf_dataset = self.create_hf_ds( - dataframe=dataframe[["question", "retrieved_chunks"]], - ) - if "id" in hf_dataset.column_names: - hf_dataset = hf_dataset.remove_columns("id") - - result = self.ragas_eval(dataset=hf_dataset) - results_df = result.to_pandas() - results_df = results_df.set_index(dataframe.index) - - return results_df diff --git a/components/evaluate_ragas/tests/component_test.py b/components/evaluate_ragas/tests/component_test.py deleted file mode 100644 index 3a2f79be4..000000000 --- a/components/evaluate_ragas/tests/component_test.py +++ /dev/null @@ -1,117 +0,0 @@ -import os - -import pandas as pd -import pyarrow as pa -from main import RetrieverEval - - -def test_transform(): - input_dataframe = pd.DataFrame( - { - "text": [ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit?", - "Sed massa massa, interdum a porttitor sit amet, semper eget nunc?", - ], - "retrieved_chunks": [ - [ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. \ - Quisque ut efficitur neque. Aenean mollis eleifend est, \ - eu laoreet magna egestas quis. Cras id sagittis erat. \ - Aliquam vel blandit arcu. Morbi ac nulla ullamcorper, \ - rutrum neque nec, pellentesque diam. Nulla nec tempor \ - enim. Suspendisse a volutpat leo, quis varius dolor.", - "Curabitur placerat ultrices mauris et lobortis. Maecenas \ - laoreet tristique sagittis. Integer facilisis eleifend \ - dolor, quis fringilla orci eleifend ac. Vestibulum nunc \ - odio, tincidunt ut augue et, ornare vehicula sapien. Orci \ - varius natoque penatibus et magnis dis parturient montes, \ - nascetur ridiculus mus. Sed auctor felis lacus, rutrum \ - tempus ligula viverra ac. Curabitur pharetra mauris et \ - ornare pulvinar. Suspendisse a ultricies nisl. Mauris \ - sit amet odio condimentum, venenatis orci vitae, \ - tincidunt purus. Ut ullamcorper convallis ligula ac \ - posuere. In efficitur enim ac lacus dignissim congue. \ - Nam turpis augue, aliquam et velit sit amet, varius \ - euismod ante. 
Duis volutpat nisl sit amet auctor tempus.\ - Vivamus in eros ex.", - ], - [ - "am leo massa, ultricies eu viverra ac, commodo non sapien. \ - Mauris et mauris sollicitudin, ultricies ex ac, luctus \ - nulla.", - "Cras tincidunt facilisis mi, ac eleifend justo lobortis ut. \ - In lobortis cursus ante et faucibus. Vestibulum auctor \ - felis at odio varius, ac vulputate leo dictum. \ - Phasellus in augue ante. Aliquam aliquam mauris \ - sed tellus egestas fermentum.", - ], - ], - }, - ) - - component = RetrieverEval( - module="langchain.llms", - llm_name="OpenAI", - llm_kwargs={"openai_api_key": os.environ["OPENAI_KEY"]}, - produces={ - "context_precision": pa.float32(), - "context_relevancy": pa.float32(), - }, - ) - - output_dataframe = component.transform(input_dataframe) - - expected_output_dataframe = pd.DataFrame( - { - "question": [ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit?", - "Sed massa massa, interdum a porttitor sit amet, semper eget nunc?", - ], - "contexts": [ - [ - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. \ - Quisque ut efficitur neque. Aenean mollis eleifend est, \ - eu laoreet magna egestas quis. Cras id sagittis erat. \ - Aliquam vel blandit arcu. Morbi ac nulla ullamcorper, \ - rutrum neque nec, pellentesque diam. Nulla nec tempor \ - enim. Suspendisse a volutpat leo, quis varius dolor.", - "Curabitur placerat ultrices mauris et lobortis. Maecenas \ - laoreet tristique sagittis. Integer facilisis eleifend \ - dolor, quis fringilla orci eleifend ac. Vestibulum nunc \ - odio, tincidunt ut augue et, ornare vehicula sapien. Orci \ - varius natoque penatibus et magnis dis parturient montes, \ - nascetur ridiculus mus. Sed auctor felis lacus, rutrum \ - tempus ligula viverra ac. Curabitur pharetra mauris et \ - ornare pulvinar. Suspendisse a ultricies nisl. Mauris \ - sit amet odio condimentum, venenatis orci vitae, \ - tincidunt purus. Ut ullamcorper convallis ligula ac \ - posuere. In efficitur enim ac lacus dignissim congue. \ - Nam turpis augue, aliquam et velit sit amet, varius \ - euismod ante. Duis volutpat nisl sit amet auctor tempus.\ - Vivamus in eros ex.", - ], - [ - "am leo massa, ultricies eu viverra ac, commodo non sapien. \ - Mauris et mauris sollicitudin, ultricies ex ac, luctus \ - nulla.", - "Cras tincidunt facilisis mi, ac eleifend justo lobortis ut. \ - In lobortis cursus ante et faucibus. Vestibulum auctor \ - felis at odio varius, ac vulputate leo dictum. \ - Phasellus in augue ante. 
Aliquam aliquam mauris \ - sed tellus egestas fermentum.", - ], - ], - "context_precision": 0.15, - "context_relevancy": 0.35, - }, - ) - - # Check if columns are the same - columns_equal = expected_output_dataframe.columns.equals(output_dataframe.columns) - - # Check if data types within each column match - dtypes_match = expected_output_dataframe.dtypes.equals(output_dataframe.dtypes) - - # Check if both conditions are met - assert columns_equal - assert dtypes_match diff --git a/components/evaluate_ragas/tests/pytest.ini b/components/evaluate_ragas/tests/pytest.ini deleted file mode 100644 index bf6a8a517..000000000 --- a/components/evaluate_ragas/tests/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -pythonpath = ../src \ No newline at end of file diff --git a/components/evaluate_ragas/tests/requirements.txt b/components/evaluate_ragas/tests/requirements.txt deleted file mode 100644 index de1887bec..000000000 --- a/components/evaluate_ragas/tests/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pytest==7.4.2 \ No newline at end of file diff --git a/components/load_with_llamahub/Dockerfile b/components/load_with_llamahub/Dockerfile deleted file mode 100644 index 5de6e945f..000000000 --- a/components/load_with_llamahub/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -FROM --platform=linux/amd64 python:3.8-slim as base - -# System dependencies -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install git -y - -# Install requirements -COPY requirements.txt / -RUN pip3 install --no-cache-dir -r requirements.txt - -# Install Fondant -# This is split from other requirements to leverage caching -ARG FONDANT_VERSION=main -RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} - -# Set the working directory to the component folder -WORKDIR /component -COPY src/ src/ - -FROM base as test -COPY tests/ tests/ -RUN pip3 install --no-cache-dir -r tests/requirements.txt -RUN python -m pytest tests - -FROM base -WORKDIR /component/src -ENTRYPOINT ["fondant", "execute", "main"] - diff --git a/components/load_with_llamahub/README.md b/components/load_with_llamahub/README.md deleted file mode 100644 index 5c3aa1320..000000000 --- a/components/load_with_llamahub/README.md +++ /dev/null @@ -1,78 +0,0 @@ -# Load with LlamaHub - - -## Description -Load data using a LlamaHub loader. For available loaders, check the -[LlamaHub](https://llamahub.ai/). - - - -## Inputs / outputs - - -### Consumes - - -**This component does not consume data.** - - - -### Produces - -**This component can produce additional fields** -- : -This defines a mapping to update the fields produced by the operation as defined in the component spec. -The keys are the names of the fields to be produced by the component, while the values are -the type of the field that should be used to write the output dataset. - - - -## Arguments - -The component takes the following arguments to alter its behavior: - -| argument | type | description | default | -| -------- | ---- | ----------- | ------- | -| loader_class | str | The name of the LlamaIndex loader class to use. Make sure to provide the name and not the id. The name is passed to `llama_index.download_loader` to download the specified loader. | / | -| loader_kwargs | str | Keyword arguments to pass when instantiating the loader class. Check the documentation of the loader to check which arguments it accepts. | / | -| load_kwargs | str | Keyword arguments to pass to the `.load()` method of the loader. 
Check the documentation ofthe loader to check which arguments it accepts. | / | -| additional_requirements | list | Some loaders require additional dependencies to be installed. You can specify those here. Use a format accepted by `pip install`. Eg. "pypdf" or "pypdf==3.17.1". Unfortunately additional requirements for LlamaIndex loaders are not documented well, but if a dependencyis missing, a clear error message will be thrown. | / | -| n_rows_to_load | int | Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale | / | -| index_column | str | Column to set index to in the load component, if not specified a default globally unique index will be set | / | - - -## Usage - -You can add this component to your pipeline using the following code: - -```python -from fondant.pipeline import Pipeline - - -pipeline = Pipeline(...) - -dataset = pipeline.read( - "load_with_llamahub", - arguments={ - # Add arguments - # "loader_class": , - # "loader_kwargs": , - # "load_kwargs": , - # "additional_requirements": [], - # "n_rows_to_load": 0, - # "index_column": , - }, - produces={ - : , - ..., # Add fields - }, -) -``` - - -## Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` diff --git a/components/load_with_llamahub/fondant_component.yaml b/components/load_with_llamahub/fondant_component.yaml deleted file mode 100644 index ca16ff794..000000000 --- a/components/load_with_llamahub/fondant_component.yaml +++ /dev/null @@ -1,47 +0,0 @@ -name: Load with LlamaHub -description: | - Load data using a LlamaHub loader. For available loaders, check the - [LlamaHub](https://llamahub.ai/). -image: fndnt/load_with_llamahub:dev -tags: - - Data loading - -produces: - additionalProperties: true - -args: - loader_class: - description: | - The name of the LlamaIndex loader class to use. Make sure to provide the name and not the - id. The name is passed to `llama_index.download_loader` to download the specified loader. - type: str - loader_kwargs: - description: | - Keyword arguments to pass when instantiating the loader class. Check the documentation of - the loader to check which arguments it accepts. - type: str - load_kwargs: - description: | - Keyword arguments to pass to the `.load()` method of the loader. Check the documentation of - the loader to check which arguments it accepts. - type: str - additional_requirements: - description: | - Some loaders require additional dependencies to be installed. You can specify those here. - Use a format accepted by `pip install`. Eg. "pypdf" or "pypdf==3.17.1". Unfortunately - additional requirements for LlamaIndex loaders are not documented well, but if a dependency - is missing, a clear error message will be thrown. - type: list - default: [] - n_rows_to_load: - description: | - Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs - on a small scale - type: int - default: None - index_column: - description: | - Column to set index to in the load component, if not specified a default globally unique - index will be set - type: str - default: None diff --git a/components/load_with_llamahub/requirements.txt b/components/load_with_llamahub/requirements.txt deleted file mode 100644 index 3a7971f8f..000000000 --- a/components/load_with_llamahub/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -llama-index==0.9.9 diff --git a/components/load_with_llamahub/src/main.py b/components/load_with_llamahub/src/main.py deleted file mode 100644 index 8be99f096..000000000 --- a/components/load_with_llamahub/src/main.py +++ /dev/null @@ -1,110 +0,0 @@ -import logging -import subprocess -import sys -import typing as t -from collections import defaultdict - -import dask.dataframe as dd -import pandas as pd -from fondant.component import DaskLoadComponent -from fondant.core.component_spec import OperationSpec -from llama_index import download_loader - -logger = logging.getLogger(__name__) - - -class LlamaHubReader(DaskLoadComponent): - def __init__( - self, - spec: OperationSpec, - *, - loader_class: str, - loader_kwargs: dict, - load_kwargs: dict, - additional_requirements: t.List[str], - n_rows_to_load: t.Optional[int] = None, - index_column: t.Optional[str] = None, - ) -> None: - """ - Args: - spec: the component spec - loader_class: The name of the LlamaIndex loader class to use - loader_kwargs: Keyword arguments to pass when instantiating the loader class - load_kwargs: Keyword arguments to pass to the `.load()` method of the loader - additional_requirements: Additional Python requirements to install - n_rows_to_load: optional argument that defines the number of rows to load. - Useful for testing pipeline runs on a small scale. - index_column: Column to set index to in the load component, if not specified a default - globally unique index will be set. 
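
Outside of Fondant, these arguments map onto plain llama-index calls. A rough sketch of what the component does with `loader_class`, `loader_kwargs` and `load_kwargs`, using the ArxivReader from the test below as an example (the query values are illustrative; running it assumes llama-index 0.9.x, the `pypdf` extra and network access):

```python
# Rough equivalent of what LlamaHubReader does with its loader arguments.
from llama_index import download_loader

loader_cls = download_loader("ArxivReader")   # loader_class
loader = loader_cls()                          # instantiated with loader_kwargs
documents = loader.load_data(                  # called with load_kwargs
    search_query="jeff dean",
    max_results=5,
)

for doc in documents[:2]:
    print(doc.metadata.get("URL"), doc.text[:80])
```
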
- """ - self.n_rows_to_load = n_rows_to_load - self.index_column = index_column - self.spec = spec - - self.install_additional_requirements(additional_requirements) - - loader_cls = download_loader(loader_class) - self.loader = loader_cls(**loader_kwargs) - self.load_kwargs = load_kwargs - - @staticmethod - def install_additional_requirements(additional_requirements: t.List[str]): - for requirement in additional_requirements: - subprocess.check_call( # nosec - [sys.executable, "-m", "pip", "install", requirement], - ) - - def set_df_index(self, dask_df: dd.DataFrame) -> dd.DataFrame: - if self.index_column is None: - logger.info( - "Index column not specified, setting a globally unique index", - ) - - def _set_unique_index(dataframe: pd.DataFrame, partition_info=None): - """Function that sets a unique index based on the partition and row number.""" - dataframe["id"] = 1 - dataframe["id"] = ( - str(partition_info["number"]) - + "_" - + (dataframe.id.cumsum()).astype(str) - ) - dataframe.index = dataframe.pop("id") - return dataframe - - def _get_meta_df() -> pd.DataFrame: - meta_dict = {"id": pd.Series(dtype="object")} - for field_name, field in self.spec.inner_produces.items(): - meta_dict[field_name] = pd.Series( - dtype=pd.ArrowDtype(field.type.value), - ) - return pd.DataFrame(meta_dict).set_index("id") - - meta = _get_meta_df() - dask_df = dask_df.map_partitions(_set_unique_index, meta=meta) - else: - logger.info(f"Setting `{self.index_column}` as index") - dask_df = dask_df.set_index(self.index_column, drop=True) - - return dask_df - - def load(self) -> dd.DataFrame: - try: - documents = self.loader.lazy_load_data(**self.load_kwargs) - except NotImplementedError: - documents = self.loader.load_data(**self.load_kwargs) - - doc_dict = defaultdict(list) - for d, document in enumerate(documents): - for column in self.spec.inner_produces: - if column == "text": - doc_dict["text"].append(document.text) - else: - doc_dict[column].append(document.metadata.get(column)) - - if d == self.n_rows_to_load: - break - - dask_df = dd.from_dict(doc_dict, npartitions=1) - - dask_df = self.set_df_index(dask_df) - return dask_df diff --git a/components/load_with_llamahub/tests/component_test.py b/components/load_with_llamahub/tests/component_test.py deleted file mode 100644 index 217b42281..000000000 --- a/components/load_with_llamahub/tests/component_test.py +++ /dev/null @@ -1,35 +0,0 @@ -from pathlib import Path - -import yaml -from fondant.core.component_spec import ComponentSpec - -from src.main import LlamaHubReader - - -def test_arxiv_reader(): - """Test the component with the ArxivReader. - - This test requires a stable internet connection, both to download the loader, and to download - the papers from Arxiv. 
- """ - with open(Path(__file__).with_name("fondant_component.yaml")) as f: - spec = yaml.safe_load(f) - spec = ComponentSpec(spec) - - component = LlamaHubReader( - spec=spec, - loader_class="ArxivReader", - loader_kwargs={}, - load_kwargs={ - "search_query": "jeff dean", - "max_results": 5, - }, - additional_requirements=["pypdf"], - n_rows_to_load=None, - index_column=None, - ) - - output_dataframe = component.load().compute() - - assert len(output_dataframe) > 0 - assert output_dataframe.columns.tolist() == ["text", "URL", "Title of this paper"] diff --git a/components/load_with_llamahub/tests/fondant_component.yaml b/components/load_with_llamahub/tests/fondant_component.yaml deleted file mode 100644 index b0f34786f..000000000 --- a/components/load_with_llamahub/tests/fondant_component.yaml +++ /dev/null @@ -1,50 +0,0 @@ -name: Load with LlamaHub -description: | - Load data using a LlamaHub loader. For available loaders, check the - [LlamaHub](https://llamahub.ai/). -image: ghcr.io/ml6team/load_with_llamahub:dev - -produces: - text: - type: string - URL: - type: string - Title of this paper: - type: string - -args: - loader_class: - description: | - The name of the LlamaIndex loader class to use. Make sure to provide the name and not the - id. The name is passed to `llama_index.download_loader` to download the specified loader. - type: str - loader_kwargs: - description: | - Keyword arguments to pass when instantiating the loader class. Check the documentation of - the loader to check which arguments it accepts. - type: str - load_kwargs: - description: | - Keyword arguments to pass to the `.load()` method of the loader. Check the documentation of - the loader to check which arguments it accepts. - type: str - additional_requirements: - description: | - Some loaders require additional dependencies to be installed. You can specify those here. - Use a format accepted by `pip install`. Eg. "pypdf" or "pypdf==3.17.1". Unfortunately - additional requirements for LlamaIndex loaders are not documented well, but if a dependency - is missing, a clear error message will be thrown. - type: list - default: [] - n_rows_to_load: - description: | - Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs - on a small scale - type: int - default: None - index_column: - description: | - Column to set index to in the load component, if not specified a default globally unique - index will be set - type: str - default: None diff --git a/components/load_with_llamahub/tests/pytest.ini b/components/load_with_llamahub/tests/pytest.ini deleted file mode 100644 index bf6a8a517..000000000 --- a/components/load_with_llamahub/tests/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -pythonpath = ../src \ No newline at end of file diff --git a/components/load_with_llamahub/tests/requirements.txt b/components/load_with_llamahub/tests/requirements.txt deleted file mode 100644 index 2a929edcc..000000000 --- a/components/load_with_llamahub/tests/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pytest==7.4.2 diff --git a/components/normalize_text/Dockerfile b/components/normalize_text/Dockerfile deleted file mode 100644 index c1e64c082..000000000 --- a/components/normalize_text/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM --platform=linux/amd64 python:3.8-slim as base - -# System dependencies -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install git -y - -# Install requirements -COPY requirements.txt / -RUN pip3 install --no-cache-dir -r requirements.txt - -# Install Fondant -# This is split from other requirements to leverage caching -ARG FONDANT_VERSION=main -RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} - -# Set the working directory to the component folder -WORKDIR /component -COPY src/ src/ - -FROM base as test -COPY tests/ tests/ -RUN pip3 install --no-cache-dir -r tests/requirements.txt -RUN python -m pytest tests - -FROM base -WORKDIR /component/src -ENTRYPOINT ["fondant", "execute", "main"] diff --git a/components/normalize_text/README.md b/components/normalize_text/README.md deleted file mode 100644 index 9de782516..000000000 --- a/components/normalize_text/README.md +++ /dev/null @@ -1,84 +0,0 @@ -# Normalize text - - -## Description -This component implements several text normalization techniques to clean and preprocess textual -data: - -- Apply lowercasing: Converts all text to lowercase -- Remove unnecessary whitespaces: Eliminates extra spaces between words, e.g. tabs -- Apply NFC normalization: Converts characters to their canonical representation -- Remove common seen patterns in webpages following the implementation of - [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf) -- Remove punctuation: Strips punctuation marks from the text - -These text normalization techniques are valuable for preparing text data before using it for -the training of large language models. - - - -## Inputs / outputs - - -### Consumes -**This component consumes:** - -- text: string - - - - - -### Produces -**This component produces:** - -- text: string - - - - -## Arguments - -The component takes the following arguments to alter its behavior: - -| argument | type | description | default | -| -------- | ---- | ----------- | ------- | -| remove_additional_whitespaces | bool | If true remove all additional whitespace, tabs. | / | -| apply_nfc | bool | If true apply nfc normalization | / | -| normalize_lines | bool | If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. 
counter | / | -| do_lowercase | bool | If true apply lowercasing | / | -| remove_punctuation | str | If true punctuation will be removed | / | - - -## Usage - -You can add this component to your pipeline using the following code: - -```python -from fondant.pipeline import Pipeline - - -pipeline = Pipeline(...) - -dataset = pipeline.read(...) - -dataset = dataset.apply( - "normalize_text", - arguments={ - # Add arguments - # "remove_additional_whitespaces": False, - # "apply_nfc": False, - # "normalize_lines": False, - # "do_lowercase": False, - # "remove_punctuation": , - }, -) -``` - - -## Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` diff --git a/components/normalize_text/fondant_component.yaml b/components/normalize_text/fondant_component.yaml deleted file mode 100644 index 35b6c79de..000000000 --- a/components/normalize_text/fondant_component.yaml +++ /dev/null @@ -1,42 +0,0 @@ -name: Normalize text -image: fndnt/normalize_text:latest -description: | - This component implements several text normalization techniques to clean and preprocess textual - data: - - - Apply lowercasing: Converts all text to lowercase - - Remove unnecessary whitespaces: Eliminates extra spaces between words, e.g. tabs - - Apply NFC normalization: Converts characters to their canonical representation - - Remove common seen patterns in webpages following the implementation of - [Penedo et al.](https://arxiv.org/pdf/2306.01116.pdf) - - Remove punctuation: Strips punctuation marks from the text - - These text normalization techniques are valuable for preparing text data before using it for - the training of large language models. -tags: - - Text processing - -consumes: - text: - type: string - -produces: - text: - type: string - -args: - remove_additional_whitespaces: - description: If true remove all additional whitespace, tabs. - type: bool - apply_nfc: - description: If true apply nfc normalization - type: bool - normalize_lines: - description: If true analyze documents line-by-line and apply various rules to discard or edit lines. Used to removed common patterns in webpages, e.g. counter - type: bool - do_lowercase: - description: If true apply lowercasing - type: bool - remove_punctuation: - description: If true punctuation will be removed - type: str \ No newline at end of file diff --git a/components/normalize_text/requirements.txt b/components/normalize_text/requirements.txt deleted file mode 100644 index a4299def8..000000000 --- a/components/normalize_text/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -ftfy==6.1.1 \ No newline at end of file diff --git a/components/normalize_text/src/main.py b/components/normalize_text/src/main.py deleted file mode 100644 index cda2939cf..000000000 --- a/components/normalize_text/src/main.py +++ /dev/null @@ -1,119 +0,0 @@ -"""A component that normalizes text.""" -import logging -import re -import string -from typing import List - -import ftfy -import pandas as pd -from fondant.component import PandasTransformComponent -from utils import is_counter, is_one_word, mainly_uppercase, only_numerical - -logger = logging.getLogger(__name__) - - -def _remove_punctuation(text): - """Remove punctuation in given text.""" - return text.translate(str.maketrans("", "", string.punctuation)) - - -def _remove_additional_whitespaces(text): - """ - Text cleaning method from slimpajama approach. 
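
Taken together, the transform below applies line filtering, lowercasing, NFC fixing, punctuation stripping and whitespace collapsing, in that order. A simplified standalone sketch of the same steps (only the counter rule is shown for line filtering; the sample string is made up):

```python
# Standalone sketch of the normalization steps applied by transform(), in order.
import re
import string

import ftfy

text = "C\u0327a va,\tWORLD!!\n3 likes\nthis is fine"

# normalize_lines (simplified): drop counter-like lines such as "3 likes"
lines = [l for l in text.split("\n") if not re.match(r"^\d+\s+\S+$", l.strip())]
text = " ".join(lines)

text = text.lower()                                                 # lowercasing
text = ftfy.fix_text(text, normalization="NFC")                     # NFC normalization
text = text.translate(str.maketrans("", "", string.punctuation))    # strip punctuation
text = re.sub(r"\s+", " ", text.strip())                            # collapse whitespace

print(text)  # -> "ça va world this is fine"
```
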
- https://github.com/Cerebras/modelzoo/blob/main/modelzoo/transformers/data_processing/slimpajama/preprocessing/filter.py - Apply remove punctuation, and remove consecutive spaces, newlines, tabs in the middle - and in the beginning / end. - """ - return re.sub(r"\s+", " ", text.strip()) - - -def normalize_lines(text): - def any_condition_met(line, discard_condition_functions): - return any(condition(line) for condition in discard_condition_functions) - - discard_conditions = [mainly_uppercase, only_numerical, is_counter, is_one_word] - return " ".join( - [ - line - for line in text.split("\n") - if not any_condition_met(line, discard_conditions) - ], - ) - - -class NormalizeTextComponent(PandasTransformComponent): - """Component that normalizes text.""" - - def __init__( - self, - *, - remove_additional_whitespaces: bool, - apply_nfc: bool, - normalize_lines: bool, - do_lowercase: bool, - remove_punctuation: bool, - ): - self.remove_additional_whitespaces = remove_additional_whitespaces - self.apply_nfc = apply_nfc - self.normalize_lines = normalize_lines - self.do_lowercase = do_lowercase - self.remove_punctuation = remove_punctuation - - @staticmethod - def _do_nfc_normalization(text: str): - """Apply nfc normalization to the text of the dataframe.""" - return ftfy.fix_text(text, normalization="NFC") - - @staticmethod - def _remove_patterns(regex_patterns: List[str], text: str): - """Remove each regex pattern in the provided string.""" - for pattern in regex_patterns: - text = re.sub(pattern, "", text) - return text - - def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - """ - Apply normalization transformations. The component is capable of: - - NFC normalization - - Lowercasing - - Removing of unnecessary whitespaces (e.g. tabs), punctuation - - Apply line-wise transformations that exclude lines matching specified patterns. - Patterns include lines that are mainly composed of uppercase characters, lines that consist - only of numerical characters, lines that are counters (e.g., "3 likes"), and lines - that contain only one word. - - Args: - dataframe: Pandas dataframe. - - Returns: - Pandas dataframe - """ - if self.normalize_lines: - dataframe["text"] = dataframe["text"].apply( - normalize_lines, - ) - - if self.do_lowercase: - dataframe["text"] = dataframe["text"].apply( - lambda x: x.lower(), - ) - - if self.apply_nfc: - dataframe["text"] = dataframe["text"].apply( - self._do_nfc_normalization, - ) - - if self.remove_punctuation: - dataframe["text"] = dataframe["text"].apply( - _remove_punctuation, - ) - - if self.remove_additional_whitespaces: - dataframe["text"] = dataframe["text"].apply( - _remove_additional_whitespaces, - ) - - # remove all empty rows - dataframe = dataframe[dataframe["text"].astype(bool)] - - return dataframe diff --git a/components/normalize_text/src/utils.py b/components/normalize_text/src/utils.py deleted file mode 100644 index b487bc61e..000000000 --- a/components/normalize_text/src/utils.py +++ /dev/null @@ -1,65 +0,0 @@ -import re - - -def mainly_uppercase(line: str, threshold: float = 0.7) -> bool: - """ - Checks if a line is mainly composed of uppercase characters. - - Args: - line: The input line to check. - threshold (float): The threshold (between 0 and 1) to determine what is considered - "mainly uppercase." - - Returns: - bool: True if the line is mainly uppercase, False otherwise. 
- """ - uppercase_count = sum(1 for char in line if char.isupper()) - total_chars = len(line) - if total_chars == 0: - return False - - uppercase_ratio = uppercase_count / total_chars - return uppercase_ratio >= threshold - - -def only_numerical(line: str) -> bool: - """ - Checks if a line is composed only of numerical characters. - - Args: - line: The input line to check. - - Returns: - bool: True if the line is only composed of numerical characters, False otherwise. - """ - return line.isdigit() - - -def is_counter(line: str) -> bool: - """ - Checks if a line represents a counter (e.g., "3 likes"). - - Args: - line: The input line to check. - - Returns: - bool: True if the line represents a counter, False otherwise. - """ - # Use regular expression to check for the pattern: - line = line.strip() - pattern = r"^\d+\s+\S+$" - return re.match(pattern, line) is not None - - -def is_one_word(line: str) -> bool: - """ - Checks if a line contains only one word. - - Args: - line: The input line to check. - - Returns: - bool: True if the line contains only one word, False otherwise. - """ - words = line.split() - return len(words) == 1 diff --git a/components/normalize_text/tests/component_test.py b/components/normalize_text/tests/component_test.py deleted file mode 100644 index d7f52967e..000000000 --- a/components/normalize_text/tests/component_test.py +++ /dev/null @@ -1,41 +0,0 @@ -import pandas as pd - -from src.main import NormalizeTextComponent - - -def test_transform_custom_componen_test(): - """Test components transform method.""" - user_arguments = { - "remove_additional_whitespaces": True, - "apply_nfc": True, - "normalize_lines": True, - "do_lowercase": True, - "remove_punctuation": True, - } - component = NormalizeTextComponent(**user_arguments) - - input_dataframe = pd.DataFrame( - [ - "\u0043\u0327 something", - "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", - "Nulla facilisi. 
Sed eu nulla sit amet enim scelerisque dapibus.", - ], - columns=["text"], - ) - - expected_output = pd.DataFrame( - [ - "\u00e7 something", - "lorem ipsum dolor sit amet consectetur adipiscing elit", - "nulla facilisi sed eu nulla sit amet enim scelerisque dapibus", - ], - columns=["text"], - ) - - output_dataframe = component.transform(input_dataframe) - - pd.testing.assert_frame_equal( - left=expected_output, - right=output_dataframe, - check_dtype=False, - ) diff --git a/components/normalize_text/tests/pytest.ini b/components/normalize_text/tests/pytest.ini deleted file mode 100644 index bf6a8a517..000000000 --- a/components/normalize_text/tests/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -pythonpath = ../src \ No newline at end of file diff --git a/components/normalize_text/tests/requirements.txt b/components/normalize_text/tests/requirements.txt deleted file mode 100644 index 6950eb5a7..000000000 --- a/components/normalize_text/tests/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pytest==7.4.0 \ No newline at end of file diff --git a/components/normalize_text/tests/utils_test.py b/components/normalize_text/tests/utils_test.py deleted file mode 100644 index 8a0892bb2..000000000 --- a/components/normalize_text/tests/utils_test.py +++ /dev/null @@ -1,46 +0,0 @@ -from src.utils import ( - is_counter, - is_one_word, - mainly_uppercase, - only_numerical, -) - - -def test_mainly_uppercase(): - line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE" - assert mainly_uppercase(line, threshold=0.5) - - -def test_mainly_uppercase_under_threshold(): - line = "HELLO WORLD not upper SOMETHING ELSE IN UPPERCASE" - assert not mainly_uppercase(line, threshold=0.9) - - -def test_only_numerical(): - line = "42" - assert only_numerical(line) - - -def test_only_numerical_on_words(): - line = "42 lorem ipsum" - assert not only_numerical(line) - - -def test_is_counter(): - line = "13 Likes" - assert is_counter(line) - - -def test_is_not_counter(): - line = "Hello world! 42 people are part of .." 
- assert not is_counter(line) - - -def test_is_one_word(): - line = "word" - assert is_one_word(line) - - -def test_is_not_one_word(): - line = "two words" - assert not is_one_word(line) diff --git a/components/retrieve_from_weaviate/Dockerfile b/components/retrieve_from_weaviate/Dockerfile deleted file mode 100644 index 5de6e945f..000000000 --- a/components/retrieve_from_weaviate/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -FROM --platform=linux/amd64 python:3.8-slim as base - -# System dependencies -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install git -y - -# Install requirements -COPY requirements.txt / -RUN pip3 install --no-cache-dir -r requirements.txt - -# Install Fondant -# This is split from other requirements to leverage caching -ARG FONDANT_VERSION=main -RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} - -# Set the working directory to the component folder -WORKDIR /component -COPY src/ src/ - -FROM base as test -COPY tests/ tests/ -RUN pip3 install --no-cache-dir -r tests/requirements.txt -RUN python -m pytest tests - -FROM base -WORKDIR /component/src -ENTRYPOINT ["fondant", "execute", "main"] - diff --git a/components/retrieve_from_weaviate/README.md b/components/retrieve_from_weaviate/README.md deleted file mode 100644 index e5c3337ce..000000000 --- a/components/retrieve_from_weaviate/README.md +++ /dev/null @@ -1,206 +0,0 @@ -# retrieve_from_weaviate - - -## Description -Component that retrieves chunks from a Weaviate vector database. -The component can retrieve chunks based on a text search or based on a vector search. -Reranking is only supported for text search. -More info here [Cohere Ranking](https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb) -[Weaviate Search Rerank](https://weaviate.io/developers/weaviate/search/rerank) - -### Running with text as input - -```python -import pyarrow as pa -from fondant.pipeline import Pipeline - -pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline") - -dataset = pipeline.read( - "load_from_csv", - arguments={ - "dataset_uri": "path/to/dataset.csv", - }, - produces={ - "text": pa.string(), - } -) - -dataset = dataset.apply( - "index_weaviate", - arguments={ - "weaviate_url": "http://localhost:8080", - "class_name": "my_class", - "vectorizer": "text2vec-openai", - "additional_headers": { - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY" - } - }, - consumes={ - "text": "text" - } -) - -dataset = dataset.apply( - "retrieve_from_weaviate", - arguments={ - "weaviate_url": "http://localhost:8080", - "class_name": "my_class", - "top_k": 3, - "additional_headers": { - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY" - } - }, - consumes={ - "text": "text" - } -) -``` - -```python -import pyarrow as pa -from fondant.pipeline import Pipeline - -pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline") - -dataset = pipeline.read( - "load_from_csv", - arguments={ - "dataset_uri": "path/to/dataset.csv", - }, - produces={ - "text": pa.string(), - } -) - -dataset = dataset.apply( - "embed_text", - arguments={...}, - consumes={ - "text": "text", - }, -) - -dataset = dataset.apply( - "index_weaviate", - arguments={ - "weaviate_url": "http://localhost:8080", - "class_name": "my_class", - }, - consumes={ - "embedding": "embedding" - } -) - -dataset = pipeline.read( - "load_from_csv", - arguments={ - "dataset_uri": "path/to/prompt_dataset.csv", - }, - produces={ - "prompts": pa.string(), - } -) - -dataset = dataset.apply( - 
"embed_text", - arguments={...}, - consumes={ - "prompts": "text", - }, -) - -dataset = dataset.apply( - "retrieve_from_weaviate", - arguments={ - "weaviate_url": "http://localhost:8080", - "class_name": "my_class", - "top_k": 3, - consumes={ - "text": "text" - } -) -``` - - - -## Inputs / outputs - - -### Consumes - -**This component can consume additional fields** -- : -This defines a mapping to update the fields consumed by the operation as defined in the component spec. -The keys are the names of the fields to be received by the component, while the values are -the name of the field to map from the input dataset - -See the usage example below on how to define a field name for additional fields. - - - - - -### Produces -**This component produces:** - -- retrieved_chunks: list - - - - -## Arguments - -The component takes the following arguments to alter its behavior: - -| argument | type | description | default | -| -------- | ---- | ----------- | ------- | -| weaviate_url | str | The URL of the weaviate instance. | http://localhost:8080 | -| class_name | str | The name of the weaviate class that will be queried | / | -| top_k | int | Number of chunks to retrieve | / | -| additional_config | dict | Additional configuration to pass to the weaviate client. | / | -| additional_headers | dict | Additional headers to pass to the weaviate client. | / | -| hybrid_query | str | The hybrid query to be used for retrieval. Optional parameter. | / | -| hybrid_alpha | float | Argument to change how much each search affects the results. An alpha of 1 is a pure vector search. An alpha of 0 is a pure keyword search. | / | -| rerank | bool | Whether to rerank the results based on the hybrid query. Defaults to False.Check this notebook for more information on reranking:https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynbhttps://weaviate.io/developers/weaviate/search/rerank. | / | - - -## Usage - -You can add this component to your pipeline using the following code: - -```python -from fondant.pipeline import Pipeline - - -pipeline = Pipeline(...) - -dataset = pipeline.read(...) - -dataset = dataset.apply( - "retrieve_from_weaviate", - arguments={ - # Add arguments - # "weaviate_url": "http://localhost:8080", - # "class_name": , - # "top_k": 0, - # "additional_config": {}, - # "additional_headers": {}, - # "hybrid_query": , - # "hybrid_alpha": 0.0, - # "rerank": False, - }, - consumes={ - : , - ..., # Add fields - }, -) -``` - - -## Testing - -You can run the tests using docker with BuildKit. From this directory, run: -``` -docker build . --target test -``` diff --git a/components/retrieve_from_weaviate/fondant_component.yaml b/components/retrieve_from_weaviate/fondant_component.yaml deleted file mode 100644 index 7b082b37b..000000000 --- a/components/retrieve_from_weaviate/fondant_component.yaml +++ /dev/null @@ -1,175 +0,0 @@ -name: retrieve_from_weaviate -description: | - Component that retrieves chunks from a Weaviate vector database. - The component can retrieve chunks based on a text search or based on a vector search. - Reranking is only supported for text search. 
- More info here [Cohere Ranking](https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb) - [Weaviate Search Rerank](https://weaviate.io/developers/weaviate/search/rerank) - - ### Running with text as input - - ```python - import pyarrow as pa - from fondant.pipeline import Pipeline - - pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline") - - dataset = pipeline.read( - "load_from_csv", - arguments={ - "dataset_uri": "path/to/dataset.csv", - }, - produces={ - "text": pa.string(), - } - ) - - dataset = dataset.apply( - "index_weaviate", - arguments={ - "weaviate_url": "http://localhost:8080", - "class_name": "my_class", - "vectorizer": "text2vec-openai", - "additional_headers": { - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY" - } - }, - consumes={ - "text": "text" - } - ) - - dataset = dataset.apply( - "retrieve_from_weaviate", - arguments={ - "weaviate_url": "http://localhost:8080", - "class_name": "my_class", - "top_k": 3, - "additional_headers": { - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY" - } - }, - consumes={ - "text": "text" - } - ) - ``` - - ```python - import pyarrow as pa - from fondant.pipeline import Pipeline - - pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline") - - dataset = pipeline.read( - "load_from_csv", - arguments={ - "dataset_uri": "path/to/dataset.csv", - }, - produces={ - "text": pa.string(), - } - ) - - dataset = dataset.apply( - "embed_text", - arguments={...}, - consumes={ - "text": "text", - }, - ) - - dataset = dataset.apply( - "index_weaviate", - arguments={ - "weaviate_url": "http://localhost:8080", - "class_name": "my_class", - }, - consumes={ - "embedding": "embedding" - } - ) - - dataset = pipeline.read( - "load_from_csv", - arguments={ - "dataset_uri": "path/to/prompt_dataset.csv", - }, - produces={ - "prompts": pa.string(), - } - ) - - dataset = dataset.apply( - "embed_text", - arguments={...}, - consumes={ - "prompts": "text", - }, - ) - - dataset = dataset.apply( - "retrieve_from_weaviate", - arguments={ - "weaviate_url": "http://localhost:8080", - "class_name": "my_class", - "top_k": 3, - consumes={ - "text": "text" - } - ) - ``` - -image: fndnt/retrieve_from_weaviate:dev -tags: - - Data retrieval - -consumes: - additionalProperties: true - -produces: - retrieved_chunks: - type: array - items: - type: string - -args: - weaviate_url: - description: The URL of the weaviate instance. - type: str - default: http://localhost:8080 - class_name: - description: - The name of the weaviate class that will be queried - type: str - top_k: - description: Number of chunks to retrieve - type: int - additional_config: - description: | - Additional configuration to pass to the weaviate client. - type: dict - default: {} - additional_headers: - description: | - Additional headers to pass to the weaviate client. - type: dict - default: {} - hybrid_query: - description: | - The hybrid query to be used for retrieval. Optional parameter. - type: str - default: None - hybrid_alpha: - description: | - Argument to change how much each search affects the results. An alpha of 1 is a pure vector search. An alpha of 0 is a pure keyword search. - type: float - default: None - rerank: - description: | - Whether to rerank the results based on the hybrid query. Defaults to False. - Check this notebook for more information on reranking: - https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb - https://weaviate.io/developers/weaviate/search/rerank. 
- type: bool - default: False \ No newline at end of file diff --git a/components/retrieve_from_weaviate/requirements.txt b/components/retrieve_from_weaviate/requirements.txt deleted file mode 100644 index 12e81349f..000000000 --- a/components/retrieve_from_weaviate/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -weaviate-client==3.24.1 \ No newline at end of file diff --git a/components/retrieve_from_weaviate/src/main.py b/components/retrieve_from_weaviate/src/main.py deleted file mode 100644 index f69f2e684..000000000 --- a/components/retrieve_from_weaviate/src/main.py +++ /dev/null @@ -1,133 +0,0 @@ -import typing as t - -import dask.dataframe as dd -import pandas as pd -import weaviate -from fondant.component import PandasTransformComponent - - -class RetrieveFromWeaviateComponent(PandasTransformComponent): - def __init__( - self, - *, - weaviate_url: str, - class_name: str, - top_k: int, - additional_config: t.Optional[dict], - additional_headers: t.Optional[dict], - hybrid_query: t.Optional[str], - hybrid_alpha: t.Optional[float], - rerank: bool, - ) -> None: - """ - Args: - weaviate_url: An argument passed to the component. - class_name: Name of class to query - top_k: Amount of context to return. - additional_config: Additional configuration passed to the weaviate client. - additional_headers: Additional headers passed to the weaviate client. - hybrid_query: The hybrid query to be used for retrieval. Optional parameter. - hybrid_alpha: Argument to change how much each search affects the results. An alpha - of 1 is a pure vector search. An alpha of 0 is a pure keyword search. - rerank: Whether to rerank the results based on the hybrid query. Defaults to False. - Check this notebook for more information on reranking: - https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb - https://weaviate.io/developers/weaviate/search/rerank. - """ - # Initialize your component here based on the arguments - self.client = weaviate.Client( - url=weaviate_url, - additional_config=additional_config if additional_config else None, - additional_headers=additional_headers if additional_headers else None, - ) - self.class_name = class_name - self.k = top_k - self.hybrid_query, self.hybrid_alpha = self.validate_hybrid_query( - hybrid_query, - hybrid_alpha, - ) - self.rerank = rerank - - @staticmethod - def validate_hybrid_query( - hybrid_query: t.Optional[str], - hybrid_alpha: t.Optional[float], - ): - if hybrid_query is not None and hybrid_alpha is None: - msg = ( - "If hybrid_query is specified, hybrid_alpha must be specified as well." - ) - raise ValueError( - msg, - ) - - return hybrid_query, hybrid_alpha - - def validate_reranker(self, dataframe: dd.DataFrame) -> None: - if self.rerank and "prompt" not in dataframe.columns: - msg = ( - "If rerank is specified, dataframe must contain a 'text' column. Reranking is" - " only supported for text data and not for embeddings." 
- ) - raise ValueError( - msg, - ) - - def teardown(self) -> None: - del self.client - - def retrieve_chunks_from_embeddings(self, vector_query: list): - """Get results from weaviate database.""" - query = ( - self.client.query.get(self.class_name, ["passage"]) - .with_near_vector({"vector": vector_query}) - .with_limit(self.k) - .with_additional(["distance"]) - ) - if self.hybrid_query is not None: - query = query.with_hybrid(query=self.hybrid_query, alpha=self.hybrid_alpha) - - result = query.do() - - result_dict = result["data"]["Get"][self.class_name] - return [retrieved_chunk["passage"] for retrieved_chunk in result_dict] - - def retrieve_chunks_from_prompts(self, prompt: str): - """Get results from weaviate database.""" - query = ( - self.client.query.get(self.class_name, ["passage"]) - .with_near_text({"concepts": [prompt]}) - .with_limit(self.k) - ) - if self.hybrid_query is not None: - query = query.with_hybrid(query=self.hybrid_query, alpha=self.hybrid_alpha) - - if self.rerank: - query = query.with_additional( - 'rerank(property: "passage" query: "prompt") { score }', - ) - - result = query.do() - - result_dict = result["data"]["Get"][self.class_name] - return [retrieved_chunk["passage"] for retrieved_chunk in result_dict] - - def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - self.validate_reranker(dataframe) - - if "embedding" in dataframe.columns: - dataframe["retrieved_chunks"] = dataframe["embedding"].apply( - self.retrieve_chunks_from_embeddings, - ) - - elif "prompt" in dataframe.columns: - dataframe["retrieved_chunks"] = dataframe["prompt"].apply( - self.retrieve_chunks_from_prompts, - ) - else: - msg = "Dataframe must contain either an 'embedding' column or a 'prompt' column." - raise ValueError( - msg, - ) - - return dataframe diff --git a/components/retrieve_from_weaviate/tests/pytest.ini b/components/retrieve_from_weaviate/tests/pytest.ini deleted file mode 100644 index bf6a8a517..000000000 --- a/components/retrieve_from_weaviate/tests/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -pythonpath = ../src \ No newline at end of file diff --git a/components/retrieve_from_weaviate/tests/requirements.txt b/components/retrieve_from_weaviate/tests/requirements.txt deleted file mode 100644 index 2a929edcc..000000000 --- a/components/retrieve_from_weaviate/tests/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pytest==7.4.2 diff --git a/components/retrieve_from_weaviate/tests/test_component.py b/components/retrieve_from_weaviate/tests/test_component.py deleted file mode 100644 index 7b30898f1..000000000 --- a/components/retrieve_from_weaviate/tests/test_component.py +++ /dev/null @@ -1,80 +0,0 @@ -import tempfile - -import numpy as np -import pandas as pd -import weaviate -from weaviate.embedded import EmbeddedOptions - -from src.main import RetrieveFromWeaviateComponent - - -def set_up_instance(client): - """Set up an embedded instance using the provided client.""" - data = [ - { - "data_object": { - "passage": "foo", - }, - "vector": np.array([1.0, 2.0]), - }, - { - "data_object": { - "passage": "bar", - }, - "vector": np.array([2.0, 3.0]), - }, - ] - - for entry in data: - client.data_object.create( - entry["data_object"], - class_name="Test", - vector=entry["vector"], - ) - - return "http://localhost:6666" - - -def test_component(): - input_dataframe = pd.DataFrame.from_dict( - { - "id": ["1", "2"], - "embedding": [np.array([1.0, 2.0]), np.array([2.0, 3.0])], - }, - ) - input_dataframe = input_dataframe.set_index("id") - - expected_output_dataframe = 
pd.DataFrame.from_dict( - { - "id": ["1", "2"], - "retrieved_chunks": [["foo", "bar"], ["bar", "foo"]], - }, - ) - expected_output_dataframe = expected_output_dataframe.set_index("id") - - with tempfile.TemporaryDirectory() as tmpdir: - client = weaviate.Client( - embedded_options=EmbeddedOptions( - persistence_data_path=tmpdir, - ), - ) - url = set_up_instance(client) - - component = RetrieveFromWeaviateComponent( - weaviate_url=url, - class_name="Test", - top_k=2, - additional_config={}, - additional_headers={}, - hybrid_query=None, - hybrid_alpha=None, - rerank=False, - ) - - output_dataframe = component.transform(input_dataframe) - - pd.testing.assert_frame_equal( - left=expected_output_dataframe, - right=output_dataframe["retrieved_chunks"].to_frame(), - check_dtype=False, - )
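
For reference, the retrieval exercised by this test reduces to a single near-vector query against Weaviate. A minimal sketch with the v3 client pinned in `requirements.txt` (URL, class name and vector are placeholders; the class is assumed to expose a `passage` property, as above):

```python
# Minimal near-vector query with weaviate-client 3.x, mirroring
# retrieve_chunks_from_embeddings in src/main.py.
import weaviate

client = weaviate.Client(url="http://localhost:8080")

result = (
    client.query.get("MyClass", ["passage"])
    .with_near_vector({"vector": [0.1, 0.2, 0.3]})
    .with_limit(3)
    .with_additional(["distance"])
    .do()
)

chunks = [obj["passage"] for obj in result["data"]["Get"]["MyClass"]]
print(chunks)
```

Hybrid search and reranking are layered onto this same query builder via `.with_hybrid(...)` and a `rerank(...)` additional property, as the component code above does.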