diff --git a/components/caption_images/README.md b/components/caption_images/README.md index 8bb38e996..401747cbb 100644 --- a/components/caption_images/README.md +++ b/components/caption_images/README.md @@ -7,13 +7,11 @@ This component captions images using a BLIP model from the Hugging Face hub **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- captions - - text: string +- captions_text: string ### Arguments diff --git a/components/caption_images/fondant_component.yaml b/components/caption_images/fondant_component.yaml index 7a72cd815..3da8e4720 100644 --- a/components/caption_images/fondant_component.yaml +++ b/components/caption_images/fondant_component.yaml @@ -5,16 +5,12 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - captions: - fields: - text: - type: utf8 + captions_text: + type: utf8 args: model_id: diff --git a/components/caption_images/src/main.py b/components/caption_images/src/main.py index 934ea09ce..86be52b40 100644 --- a/components/caption_images/src/main.py +++ b/components/caption_images/src/main.py @@ -90,7 +90,7 @@ def __init__( self.max_new_tokens = max_new_tokens def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - images = dataframe["images"]["data"] + images = dataframe["images_data"] results: t.List[pd.Series] = [] for batch in np.split( @@ -112,4 +112,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: ).T results.append(captions) - return pd.concat(results).to_frame(name=("captions", "text")) + return pd.concat(results).to_frame(name="captions_text") diff --git a/components/chunk_text/README.md b/components/chunk_text/README.md index 97b3309e0..a12d74980 100.644 --- a/components/chunk_text/README.md +++ b/components/chunk_text/README.md @@ -11,14 +11,12 @@ consists of the id of the original document followed by the chunk index. 
**This component consumes:** -- text - - data: string +- text_data: string **This component produces:** -- text - - data: string - - original_document_id: string +- text_data: string +- text_original_document_id: string ### Arguments diff --git a/components/chunk_text/fondant_component.yaml b/components/chunk_text/fondant_component.yaml index d266b4dac..159e67556 100644 --- a/components/chunk_text/fondant_component.yaml +++ b/components/chunk_text/fondant_component.yaml @@ -10,18 +10,14 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string produces: - text: - fields: - data: - type: string - original_document_id: - type: string + text_data: + type: string + text_original_document_id: + type: string args: chunk_size: diff --git a/components/chunk_text/src/main.py b/components/chunk_text/src/main.py index 8c41220d2..da46cbbd7 100644 --- a/components/chunk_text/src/main.py +++ b/components/chunk_text/src/main.py @@ -38,7 +38,7 @@ def __init__( def chunk_text(self, row) -> t.List[t.Tuple]: - # Multi-index df has id under the name attribute + # The id is available under the row's name attribute doc_id = row.name - text_data = row[("text", "data")] + text_data = row["text_data"] docs = self.text_splitter.create_documents([text_data]) return [ (doc_id, f"{doc_id}_{chunk_id}", chunk.page_content) @@ -63,9 +63,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: ) results_df = results_df.set_index("id") - # Set multi-index column for the expected subset and field - results_df.columns = pd.MultiIndex.from_product( - [["text"], results_df.columns], - ) - return results_df diff --git a/components/chunk_text/tests/chunk_text_test.py b/components/chunk_text/tests/chunk_text_test.py index a47683ed3..f95180f98 100644 --- a/components/chunk_text/tests/chunk_text_test.py +++ b/components/chunk_text/tests/chunk_text_test.py @@ -7,7 +7,7 @@ def test_transform(): """Test chunk component method.""" input_dataframe = pd.DataFrame( { - ("text", "data"): [ + "text_data": [ "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo", "ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis", "parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec,", @@ -25,8 +25,8 @@ def test_transform(): expected_output_dataframe = pd.DataFrame( { - ("text", "original_document_id"): ["a", "a", "a", "b", "b", "c", "c"], - ("text", "data"): [ + "text_original_document_id": ["a", "a", "a", "b", "b", "c", "c"], + "text_data": [ "Lorem ipsum dolor sit amet, consectetuer", "amet, consectetuer adipiscing elit. Aenean", "elit. Aenean commodo", diff --git a/components/download_images/README.md b/components/download_images/README.md index b491007b5..6ed54d66d 100644 --- a/components/download_images/README.md +++ b/components/download_images/README.md @@ -14,15 +14,13 @@ from the img2dataset library. 
**This component consumes:** -- images - - url: string +- images_url: string **This component produces:** -- images - - data: binary - - width: int32 - - height: int32 +- images_data: binary +- images_width: int32 +- images_height: int32 ### Arguments diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml index 1982a96ba..91efeca15 100644 --- a/components/download_images/fondant_component.yaml +++ b/components/download_images/fondant_component.yaml @@ -13,21 +13,16 @@ tags: - Image processing consumes: - images: - fields: - url: - type: string + images_url: + type: string produces: - images: - fields: - data: - type: binary - width: - type: int32 - height: - type: int32 - additionalFields: false + images_data: + type: binary + images_width: + type: int32 + images_height: + type: int32 args: timeout: @@ -53,7 +48,7 @@ args: description: Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". type: str default: 'border' - resize_only_if_bigger: + resize_only_if_bigger: description: If True, resize only if image is bigger than image_size. type: bool default: False diff --git a/components/download_images/src/main.py b/components/download_images/src/main.py index 8a37b86eb..070859e07 100644 --- a/components/download_images/src/main.py +++ b/components/download_images/src/main.py @@ -119,7 +119,7 @@ async def download_dataframe() -> None: images = await asyncio.gather( *[ self.download_and_resize_image(id_, url, semaphore=semaphore) - for id_, url in zip(dataframe.index, dataframe["images"]["url"]) + for id_, url in zip(dataframe.index, dataframe["images_url"]) ], ) results.extend(images) @@ -134,8 +134,5 @@ async def download_dataframe() -> None: results_df = results_df.dropna() results_df = results_df.set_index("id", drop=True) - results_df.columns = pd.MultiIndex.from_product( - [["images"], results_df.columns], - ) return results_df diff --git a/components/download_images/tests/test_component.py b/components/download_images/tests/test_component.py index 1f690e6e5..d851ecd73 100644 --- a/components/download_images/tests/test_component.py +++ b/components/download_images/tests/test_component.py @@ -45,7 +45,7 @@ def test_transform(respx_mock): input_dataframe = pd.DataFrame( { - ("images", "url"): urls, + "images_url": urls, }, index=pd.Index(ids, name="id"), ) @@ -55,9 +55,9 @@ def test_transform(respx_mock): resized_images = [component.resizer(io.BytesIO(image))[0] for image in images] expected_dataframe = pd.DataFrame( { - ("images", "data"): resized_images, - ("images", "width"): [image_size] * len(ids), - ("images", "height"): [image_size] * len(ids), + "images_data": resized_images, + "images_width": [image_size] * len(ids), + "images_height": [image_size] * len(ids), }, index=pd.Index(ids, name="id"), ) diff --git a/components/embed_images/README.md b/components/embed_images/README.md index eec02f577..23e746136 100644 --- a/components/embed_images/README.md +++ b/components/embed_images/README.md @@ -7,13 +7,11 @@ Component that generates CLIP embeddings from images **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- embeddings - - data: list +- embeddings_data: list ### Arguments diff --git a/components/embed_images/fondant_component.yaml b/components/embed_images/fondant_component.yaml index a176b2f6b..86fdb53a4 100644 --- a/components/embed_images/fondant_component.yaml +++ b/components/embed_images/fondant_component.yaml @@ -2,21 +2,17 @@ name: Embed 
images description: Component that generates CLIP embeddings from images image: fndnt/embed_images:dev tags: - - Image processing + - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - embeddings: - fields: - data: - type: array - items: - type: float32 + embeddings_data: + type: array + items: + type: float32 args: model_id: diff --git a/components/embed_images/src/main.py b/components/embed_images/src/main.py index 03c647dc0..a0270b1e8 100644 --- a/components/embed_images/src/main.py +++ b/components/embed_images/src/main.py @@ -90,7 +90,7 @@ def __init__( self.batch_size = batch_size def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - images = dataframe["images"]["data"] + images = dataframe["images_data"] results: t.List[pd.Series] = [] for batch in np.split( @@ -110,4 +110,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: ).T results.append(embeddings) - return pd.concat(results).to_frame(name=("embeddings", "data")) + return pd.concat(results).to_frame(name="embeddings_data") diff --git a/components/embed_text/README.md b/components/embed_text/README.md index a30a9ec4f..c53a779b9 100644 --- a/components/embed_text/README.md +++ b/components/embed_text/README.md @@ -7,14 +7,12 @@ Component that generates embeddings of text passages. **This component consumes:** -- text - - data: string +- text_data: string **This component produces:** -- text - - data: string - - embedding: list +- text_data: string +- text_embedding: list ### Arguments diff --git a/components/embed_text/fondant_component.yaml b/components/embed_text/fondant_component.yaml index 2e34c5c0a..a1a3ca816 100644 --- a/components/embed_text/fondant_component.yaml +++ b/components/embed_text/fondant_component.yaml @@ -5,21 +5,17 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string produces: - text: - fields: - data: - type: string - embedding: - type: array - items: - type: float32 - + text_data: + type: string + text_embedding: + type: array + items: + type: float32 + args: model_provider: description: | diff --git a/components/embed_text/src/main.py b/components/embed_text/src/main.py index c8c2acfde..3fdc08e47 100644 --- a/components/embed_text/src/main.py +++ b/components/embed_text/src/main.py @@ -65,7 +65,7 @@ def get_embeddings_vectors(self, texts): return self.embedding_model.embed_documents(texts.tolist()) def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - dataframe[("text", "embedding")] = self.get_embeddings_vectors( - dataframe[("text", "data")], + dataframe["text_embedding"] = self.get_embeddings_vectors( + dataframe["text_data"], ) return dataframe diff --git a/components/embedding_based_laion_retrieval/Dockerfile b/components/embedding_based_laion_retrieval/Dockerfile index 72525d884..0cdcde81a 100644 --- a/components/embedding_based_laion_retrieval/Dockerfile +++ b/components/embedding_based_laion_retrieval/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 python:3.8-slim +FROM --platform=linux/amd64 python:3.8-slim as base # System dependencies RUN apt-get update && \ @@ -16,8 +16,15 @@ RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team # Set the working directory to the component folder WORKDIR /component/src +COPY src/ src/ +ENV PYTHONPATH "${PYTHONPATH}:./src" -# Copy over src-files -COPY src/ . +FROM base as test +COPY test_requirements.txt . 
+RUN pip3 install --no-cache-dir -r test_requirements.txt +COPY tests/ tests/ +RUN python -m pytest tests -ENTRYPOINT ["fondant", "execute", "main"] \ No newline at end of file +FROM base +WORKDIR /component/src +ENTRYPOINT ["fondant", "execute", "main"] diff --git a/components/embedding_based_laion_retrieval/README.md b/components/embedding_based_laion_retrieval/README.md index 454253416..97e0866a5 100644 --- a/components/embedding_based_laion_retrieval/README.md +++ b/components/embedding_based_laion_retrieval/README.md @@ -9,13 +9,12 @@ used to find images similar to the embedded images / captions. **This component consumes:** -- embeddings - - data: list +- embeddings_data: list **This component produces:** -- images - - url: string +- images_url: string +- embedding_id: string ### Arguments @@ -47,3 +46,9 @@ embedding_based_laion_retrieval_op = ComponentOp.from_registry( pipeline.add_op(embedding_based_laion_retrieval_op, dependencies=[...]) #Add previous component as dependency ``` +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` diff --git a/components/embedding_based_laion_retrieval/fondant_component.yaml b/components/embedding_based_laion_retrieval/fondant_component.yaml index d93e634a3..d7616cfbd 100644 --- a/components/embedding_based_laion_retrieval/fondant_component.yaml +++ b/components/embedding_based_laion_retrieval/fondant_component.yaml @@ -7,19 +7,18 @@ tags: - Data retrieval consumes: - embeddings: - fields: - data: - type: array - items: - type: float32 + embeddings_data: + type: array + items: + type: float32 produces: - images: - fields: - url: - type: string - additionalSubsets: false + images_url: + type: string + embedding_id: + type: string + +previous_index: embedding_id args: num_images: diff --git a/components/embedding_based_laion_retrieval/src/main.py b/components/embedding_based_laion_retrieval/src/main.py index b350e6142..4d730f24c 100644 --- a/components/embedding_based_laion_retrieval/src/main.py +++ b/components/embedding_based_laion_retrieval/src/main.py @@ -1,7 +1,6 @@ """This component retrieves image URLs from LAION-5B based on a set of CLIP embeddings.""" import asyncio import concurrent.futures -import functools import logging import typing as t @@ -40,6 +39,10 @@ def __init__( modality=Modality.IMAGE, ) + def query(self, id_: t.Any, embedding: t.List[float]) -> t.List[t.Dict]: + results = self.client.query(embedding_input=embedding) + return [dict(d, embedding_id=id_) for d in results] + def transform( self, dataframe: pd.DataFrame, @@ -53,23 +56,20 @@ async def async_query(): futures = [ loop.run_in_executor( executor, - functools.partial( - self.client.query, - embedding_input=embedding.tolist(), - ), + self.query, + row.id, + row.embeddings_data.tolist(), ) - for embedding in dataframe["embeddings"]["data"] + for row in dataframe.itertuples() ] for response in await asyncio.gather(*futures): results.extend(response) loop.run_until_complete(async_query()) - results_df = pd.DataFrame(results)[["id", "url"]] + results_df = pd.DataFrame(results)[["id", "url", "embedding_id"]] results_df = results_df.set_index("id") - # Cast the index to string - results_df.index = results_df.index.astype(str) - results_df.columns = [["images"], ["url"]] + results_df = results_df.rename(columns={"url": "images_url"}) return results_df diff --git a/components/embedding_based_laion_retrieval/test_requirements.txt b/components/embedding_based_laion_retrieval/test_requirements.txt new file mode 
100644 index 000000000..2a929edcc --- /dev/null +++ b/components/embedding_based_laion_retrieval/test_requirements.txt @@ -0,0 +1 @@ +pytest==7.4.2 diff --git a/components/embedding_based_laion_retrieval/tests/pytest.ini b/components/embedding_based_laion_retrieval/tests/pytest.ini new file mode 100644 index 000000000..bf6a8a517 --- /dev/null +++ b/components/embedding_based_laion_retrieval/tests/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = ../src \ No newline at end of file diff --git a/components/embedding_based_laion_retrieval/tests/test_component.py b/components/embedding_based_laion_retrieval/tests/test_component.py new file mode 100644 index 000000000..ba59028bf --- /dev/null +++ b/components/embedding_based_laion_retrieval/tests/test_component.py @@ -0,0 +1,66 @@ +import typing as t + +import numpy as np +import pandas as pd + +from src.main import LAIONRetrievalComponent + + +def test_component(monkeypatch): + def mocked_client_query(embedding_input: t.List[float]) -> t.List[dict]: + if embedding_input == [1, 2]: + return [ + { + "id": "a", + "url": "http://a", + }, + { + "id": "b", + "url": "http://b", + }, + ] + if embedding_input == [2, 3]: + return [ + { + "id": "c", + "url": "http://c", + }, + { + "id": "d", + "url": "http://d", + }, + ] + msg = f"Unexpected value: `embedding_input` was {embedding_input}" + raise ValueError(msg) + + input_dataframe = pd.DataFrame.from_dict( + { + "id": ["1", "2"], + "embeddings_data": [np.array([1, 2]), np.array([2, 3])], + }, + ) + + expected_output_dataframe = pd.DataFrame.from_dict( + { + "id": ["a", "b", "c", "d"], + "images_url": ["http://a", "http://b", "http://c", "http://d"], + "embedding_id": ["1", "1", "2", "2"], + }, + ) + expected_output_dataframe = expected_output_dataframe.set_index("id") + + component = LAIONRetrievalComponent( + num_images=2, + aesthetic_score=9, + aesthetic_weight=0.5, + ) + + monkeypatch.setattr(component.client, "query", mocked_client_query) + + output_dataframe = component.transform(input_dataframe) + + pd.testing.assert_frame_equal( + left=expected_output_dataframe, + right=output_dataframe, + check_dtype=False, + ) diff --git a/components/filter_image_resolution/README.md b/components/filter_image_resolution/README.md index 1bc0c27f5..e7093e680 100644 --- a/components/filter_image_resolution/README.md +++ b/components/filter_image_resolution/README.md @@ -7,9 +7,8 @@ Component that filters images based on minimum size and max aspect ratio **This component consumes:** -- images - - width: int32 - - height: int32 +- images_width: int32 +- images_height: int32 **This component produces no data.** diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml index 0512d87f9..b6ff8cbe7 100644 --- a/components/filter_image_resolution/fondant_component.yaml +++ b/components/filter_image_resolution/fondant_component.yaml @@ -5,12 +5,10 @@ tags: - Image processing consumes: - images: - fields: - width: - type: int32 - height: - type: int32 + images_width: + type: int32 + images_height: + type: int32 args: min_image_dim: diff --git a/components/filter_image_resolution/src/main.py b/components/filter_image_resolution/src/main.py index 8fbfdfa77..b169196ec 100644 --- a/components/filter_image_resolution/src/main.py +++ b/components/filter_image_resolution/src/main.py @@ -23,8 +23,8 @@ def __init__(self, *_, min_image_dim: int, max_aspect_ratio: float) -> None: self.max_aspect_ratio = max_aspect_ratio def transform(self, dataframe: pd.DataFrame) -> 
pd.DataFrame: - width = dataframe["images"]["width"] - height = dataframe["images"]["height"] + width = dataframe["images_width"] + height = dataframe["images_height"] min_image_dim = np.minimum(width, height) max_image_dim = np.maximum(width, height) aspect_ratio = max_image_dim / min_image_dim diff --git a/components/filter_text_length/README.md b/components/filter_text_length/README.md index ed89dd128..4c5730180 100644 --- a/components/filter_text_length/README.md +++ b/components/filter_text_length/README.md @@ -7,8 +7,7 @@ A component that filters out text based on their length **This component consumes:** -- text - - data: string +- text_data: string **This component produces no data.** diff --git a/components/filter_text_length/fondant_component.yaml b/components/filter_text_length/fondant_component.yaml index fee0fb242..2451f5981 100644 --- a/components/filter_text_length/fondant_component.yaml +++ b/components/filter_text_length/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string args: min_characters_length: diff --git a/components/filter_text_length/src/main.py b/components/filter_text_length/src/main.py index 3e2f472a4..e3a6b0d61 100644 --- a/components/filter_text_length/src/main.py +++ b/components/filter_text_length/src/main.py @@ -23,10 +23,10 @@ def __init__(self, *_, min_characters_length: int, min_words_length: int): def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: """Filter out text based on their length.""" - caption_num_words = dataframe["text"]["data"].apply( + caption_num_words = dataframe["text_data"].apply( lambda x: len(fasttext.tokenize(x)), ) - caption_num_chars = dataframe["text"]["data"].apply(len) + caption_num_chars = dataframe["text_data"].apply(len) mask = (caption_num_words >= self.min_words_length) & ( caption_num_chars >= self.min_characters_length diff --git a/components/filter_text_length/tests/text_length_filter_test.py b/components/filter_text_length/tests/text_length_filter_test.py index eea98864e..55c927e79 100644 --- a/components/filter_text_length/tests/text_length_filter_test.py +++ b/components/filter_text_length/tests/text_length_filter_test.py @@ -24,6 +24,6 @@ def test_run_component_test(): # Then: dataframe only contains one row assert len(dataframe) == 1 assert ( - dataframe.loc[2]["text"]["data"] + dataframe.loc[2]["text_data"] == "This a valid sentence which should be still there" ) diff --git a/components/image_cropping/README.md b/components/image_cropping/README.md index 5d679c457..e59af3af6 100644 --- a/components/image_cropping/README.md +++ b/components/image_cropping/README.md @@ -22,15 +22,13 @@ right side is border-cropped image. 
**This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- images - - data: binary - - width: int32 - - height: int32 +- images_data: binary +- images_width: int32 +- images_height: int32 ### Arguments diff --git a/components/image_cropping/fondant_component.yaml b/components/image_cropping/fondant_component.yaml index 416bc2c1d..130b14324 100644 --- a/components/image_cropping/fondant_component.yaml +++ b/components/image_cropping/fondant_component.yaml @@ -20,20 +20,16 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - images: - fields: - data: - type: binary - width: - type: int32 - height: - type: int32 + images_data: + type: binary + images_width: + type: int32 + images_height: + type: int32 args: cropping_threshold: diff --git a/components/image_cropping/src/main.py b/components/image_cropping/src/main.py index c670fdeb8..6a62e309c 100644 --- a/components/image_cropping/src/main.py +++ b/components/image_cropping/src/main.py @@ -46,12 +46,12 @@ def __init__( def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: # crop images - dataframe["images"]["data"] = dataframe["images"]["data"].apply( + dataframe["images_data"] = dataframe["images_data"].apply( lambda image: remove_borders(image, self.cropping_threshold, self.padding), ) # extract width and height - dataframe["images"][["width", "height"]] = dataframe["images"]["data"].apply( - extract_dimensions, - axis=1, - result_type="expand", + dataframe[["images_width", "images_height"]] = dataframe[["images_data"]].apply( + lambda row: extract_dimensions(row["images_data"]), + axis=1, + result_type="expand", diff --git a/components/image_resolution_extraction/README.md b/components/image_resolution_extraction/README.md index a69a4df4e..77e11742d 100644 --- a/components/image_resolution_extraction/README.md +++ b/components/image_resolution_extraction/README.md @@ -7,15 +7,13 @@ Component that extracts image resolution data from the images **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- images - - data: binary - - width: int32 - - height: int32 +- images_data: binary +- images_width: int32 +- images_height: int32 ### Arguments diff --git a/components/image_resolution_extraction/fondant_component.yaml b/components/image_resolution_extraction/fondant_component.yaml index 1ddbf4afb..f840da680 100644 --- a/components/image_resolution_extraction/fondant_component.yaml +++ b/components/image_resolution_extraction/fondant_component.yaml @@ -5,17 +5,13 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - images: - fields: - data: - type: binary - width: - type: int32 - height: - type: int32 \ No newline at end of file + images_data: + type: binary + images_width: + type: int32 + images_height: + type: int32 \ No newline at end of file diff --git a/components/image_resolution_extraction/src/main.py b/components/image_resolution_extraction/src/main.py index 823b7b70f..a8715d831 100644 --- a/components/image_resolution_extraction/src/main.py +++ b/components/image_resolution_extraction/src/main.py @@ -38,8 +38,9 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: """ logger.info("Filtering dataset...") - dataframe[[("images", "width"), ("images", "height")]] = dataframe[ - [("images", "data")] - ].apply(lambda x: extract_dimensions(x.images.data), axis=1) + dataframe[["images_width", "images_height"]] = dataframe[["images_data"]].apply( + lambda x: 
extract_dimensions(x["images_data"]), + axis=1, + result_type="expand", + ) return dataframe diff --git a/components/index_qdrant/fondant_component.yaml b/components/index_qdrant/fondant_component.yaml index 6feb3b257..68ea33847 100644 --- a/components/index_qdrant/fondant_component.yaml +++ b/components/index_qdrant/fondant_component.yaml @@ -7,14 +7,12 @@ image: 'fndnt/index_qdrant:dev' tags: - Data writing consumes: - text: - fields: - data: - type: string - embedding: - type: array - items: - type: float32 + text_data: + type: string + text_embedding: + type: array + items: + type: float32 args: collection_name: description: The name of the Qdrant collection to upsert data into. diff --git a/components/index_weaviate/README.md b/components/index_weaviate/README.md index ce4729c52..efa6286a0 100644 --- a/components/index_weaviate/README.md +++ b/components/index_weaviate/README.md @@ -7,9 +7,8 @@ Component that takes embeddings of text snippets and indexes them into a weaviat **This component consumes:** -- text - - data: string - - embedding: list +- text_data: string +- text_embedding: list **This component produces no data.** diff --git a/components/index_weaviate/fondant_component.yaml b/components/index_weaviate/fondant_component.yaml index d20d168fd..cb06ad683 100644 --- a/components/index_weaviate/fondant_component.yaml +++ b/components/index_weaviate/fondant_component.yaml @@ -5,14 +5,12 @@ tags: - Data writing consumes: - text: - fields: - data: - type: string - embedding: - type: array - items: - type: float32 + text_data: + type: string + text_embedding: + type: array + items: + type: float32 args: weaviate_url: diff --git a/components/language_filter/README.md b/components/language_filter/README.md index c3afd6435..3aebe1e26 100644 --- a/components/language_filter/README.md +++ b/components/language_filter/README.md @@ -7,8 +7,7 @@ A component that filters text based on the provided language. **This component consumes:** -- text - - data: string +- text_data: string **This component produces no data.** diff --git a/components/language_filter/fondant_component.yaml b/components/language_filter/fondant_component.yaml index ab59a58be..3a98f27f7 100644 --- a/components/language_filter/fondant_component.yaml +++ b/components/language_filter/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string args: language: diff --git a/components/language_filter/src/main.py b/components/language_filter/src/main.py index f306512e4..4c753d1b4 100644 --- a/components/language_filter/src/main.py +++ b/components/language_filter/src/main.py @@ -38,7 +38,7 @@ def predict_lang(self, text: str): def is_language(self, row): """Predict if text of a row is written in the defined language.""" - return self.language in self.predict_lang(row["text"]) + return self.language in self.predict_lang(row["text_data"]) class LanguageFilterComponent(PandasTransformComponent): diff --git a/components/load_from_files/README.md b/components/load_from_files/README.md index 834f568e5..9a618f176 100644 --- a/components/load_from_files/README.md +++ b/components/load_from_files/README.md @@ -11,9 +11,8 @@ location. It supports the following formats: .zip, gzip, tar and tar.gz. 
**This component produces:** -- file - - filename: string - - content: binary +- file_filename: string +- file_content: binary ### Arguments diff --git a/components/load_from_files/fondant_component.yaml b/components/load_from_files/fondant_component.yaml index 11416e5b5..2e0167b9d 100644 --- a/components/load_from_files/fondant_component.yaml +++ b/components/load_from_files/fondant_component.yaml @@ -7,13 +7,11 @@ tags: - Data loading produces: - file: - fields: - filename: - type: string - content: - type: binary - + file_filename: + type: string + file_content: + type: binary + args: directory_uri: description: Local or remote path to the directory containing the files diff --git a/components/load_from_hf_hub/README.md b/components/load_from_hf_hub/README.md index 1faa0175a..e14e6f440 100644 --- a/components/load_from_hf_hub/README.md +++ b/components/load_from_hf_hub/README.md @@ -9,8 +9,7 @@ Component that loads a dataset from the hub **This component produces:** -- dummy_variable - - data: binary +- dummy_variable: binary ### Arguments diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index d6a625971..7e72f2b22 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Data loading produces: - dummy_variable: #TODO: fill in here - fields: - data: - type: binary + dummy_variable: #TODO: fill in here + type: binary args: dataset_name: @@ -19,10 +17,10 @@ args: type: dict default: {} image_column_names: - description: Optional argument, a list containing the original image column names in case the + description: Optional argument, a list containing the original image column names in case the dataset on the hub contains them. Used to format the image from HF hub format to a byte string. type: list - default: [] + default: [ ] n_rows_to_load: description: Optional argument that defines the number of rows to load. 
Useful for testing pipeline runs on a small scale type: int diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index b978a96af..ccb2dd2ab 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -54,16 +54,12 @@ def get_columns_to_keep(self) -> t.List[str]: else: invert_column_name_mapping = {} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - column_name = f"{subset_name}_{field_name}" - if ( - invert_column_name_mapping - and column_name in invert_column_name_mapping - ): - columns.append(invert_column_name_mapping[column_name]) - else: - columns.append(column_name) + for field_name, field in self.spec.produces.items(): + column_name = field_name + if invert_column_name_mapping and column_name in invert_column_name_mapping: + columns.append(invert_column_name_mapping[column_name]) + else: + columns.append(column_name) if self.index_column is not None: columns.append(self.index_column) @@ -99,11 +95,10 @@ def _set_unique_index(dataframe: pd.DataFrame, partition_info=None): def _get_meta_df() -> pd.DataFrame: meta_dict = {"id": pd.Series(dtype="object")} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - meta_dict[f"{subset_name}_{field_name}"] = pd.Series( - dtype=pd.ArrowDtype(field.type.value), - ) + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series( + dtype=pd.ArrowDtype(field.type.value), + ) return pd.DataFrame(meta_dict).set_index("id") meta = _get_meta_df() diff --git a/components/load_from_parquet/README.md b/components/load_from_parquet/README.md index c83f7e9e8..d6bda66c3 100644 --- a/components/load_from_parquet/README.md +++ b/components/load_from_parquet/README.md @@ -9,8 +9,7 @@ Component that loads a dataset from a parquet uri **This component produces:** -- dummy_variable - - data: binary +- dummy_variable: binary ### Arguments diff --git a/components/load_from_parquet/fondant_component.yaml b/components/load_from_parquet/fondant_component.yaml index 5cc5796fa..894069c59 100644 --- a/components/load_from_parquet/fondant_component.yaml +++ b/components/load_from_parquet/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Data loading produces: - dummy_variable: #TODO: fill in here - fields: - data: - type: binary + dummy_variable: + type: binary args: dataset_uri: diff --git a/components/load_from_parquet/src/main.py b/components/load_from_parquet/src/main.py index ddd338552..117ae10ce 100644 --- a/components/load_from_parquet/src/main.py +++ b/components/load_from_parquet/src/main.py @@ -50,16 +50,12 @@ def get_columns_to_keep(self) -> t.List[str]: else: invert_column_name_mapping = {} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - column_name = f"{subset_name}_{field_name}" - if ( - invert_column_name_mapping - and column_name in invert_column_name_mapping - ): - columns.append(invert_column_name_mapping[column_name]) - else: - columns.append(column_name) + for field_name, field in self.spec.produces.items(): + column_name = field_name + if invert_column_name_mapping and column_name in invert_column_name_mapping: + columns.append(invert_column_name_mapping[column_name]) + else: + columns.append(column_name) if self.index_column is not None: columns.append(self.index_column) @@ -85,11 +81,10 @@ def _set_unique_index(dataframe: pd.DataFrame, partition_info=None): def 
_get_meta_df() -> pd.DataFrame: meta_dict = {"id": pd.Series(dtype="object")} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - meta_dict[f"{subset_name}_{field_name}"] = pd.Series( - dtype=pd.ArrowDtype(field.type.value), - ) + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series( + dtype=pd.ArrowDtype(field.type.value), + ) return pd.DataFrame(meta_dict).set_index("id") meta = _get_meta_df() diff --git a/components/minhash_generator/README.md b/components/minhash_generator/README.md index 422fdc7af..5fc4cb86e 100644 --- a/components/minhash_generator/README.md +++ b/components/minhash_generator/README.md @@ -7,13 +7,11 @@ A component that generates minhashes of text. **This component consumes:** -- text - - data: string +- text_data: string **This component produces:** -- text - - minhash: list +- text_minhash: list ### Arguments diff --git a/components/minhash_generator/fondant_component.yaml b/components/minhash_generator/fondant_component.yaml index 6528112ef..1747982f8 100644 --- a/components/minhash_generator/fondant_component.yaml +++ b/components/minhash_generator/fondant_component.yaml @@ -5,18 +5,14 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string produces: - text: - fields: - minhash: - type: array - items: - type: uint64 + text_minhash: + type: array + items: + type: uint64 args: shingle_ngram_size: description: Define size of ngram used for the shingle generation diff --git a/components/minhash_generator/src/main.py b/components/minhash_generator/src/main.py index c8034334b..f61e34fcb 100644 --- a/components/minhash_generator/src/main.py +++ b/components/minhash_generator/src/main.py @@ -51,10 +51,10 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Returns: Pandas dataframe """ - dataframe[("text", "shingles")] = dataframe[("text", "data")].apply( + dataframe["text_shingles"] = dataframe["text_data"].apply( create_shingles, ) - dataframe[("text", "minhash")] = dataframe[("text", "shingles")].apply( + dataframe["text_minhash"] = dataframe["text_shingles"].apply( compute_minhash, ) diff --git a/components/normalize_text/README.md b/components/normalize_text/README.md index edc955a79..3609ba0de 100644 --- a/components/normalize_text/README.md +++ b/components/normalize_text/README.md @@ -19,8 +19,7 @@ the training of large language models. 
**This component consumes:** -- text - - data: string +- text_data: string **This component produces no data.** diff --git a/components/normalize_text/fondant_component.yaml b/components/normalize_text/fondant_component.yaml index d6551f578..fd9cfc4cb 100644 --- a/components/normalize_text/fondant_component.yaml +++ b/components/normalize_text/fondant_component.yaml @@ -17,10 +17,8 @@ tags: - Text processing consumes: - text: - fields: - data: - type: string + text_data: + type: string args: remove_additional_whitespaces: diff --git a/components/normalize_text/src/main.py b/components/normalize_text/src/main.py index 47220fba4..a98b7b36b 100644 --- a/components/normalize_text/src/main.py +++ b/components/normalize_text/src/main.py @@ -89,31 +89,31 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: Pandas dataframe """ if self.normalize_lines: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( normalize_lines, ) if self.do_lowercase: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( lambda x: x.lower(), ) if self.apply_nfc: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( self._do_nfc_normalization, ) if self.remove_punctuation: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( _remove_punctuation, ) if self.remove_additional_whitespaces: - dataframe[("text", "data")] = dataframe[("text", "data")].apply( + dataframe["text_data"] = dataframe["text_data"].apply( _remove_additional_whitespaces, ) # remove all empty rows - dataframe = dataframe[dataframe[("text", "data")].astype(bool)] + dataframe = dataframe[dataframe["text_data"].astype(bool)] return dataframe diff --git a/components/prompt_based_laion_retrieval/Dockerfile b/components/prompt_based_laion_retrieval/Dockerfile index 72525d884..0cdcde81a 100644 --- a/components/prompt_based_laion_retrieval/Dockerfile +++ b/components/prompt_based_laion_retrieval/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 python:3.8-slim +FROM --platform=linux/amd64 python:3.8-slim as base # System dependencies RUN apt-get update && \ @@ -16,8 +16,15 @@ RUN pip3 install fondant[component,aws,azure,gcp]@git+https://github.com/ml6team # Set the working directory to the component folder WORKDIR /component/src +COPY src/ src/ +ENV PYTHONPATH "${PYTHONPATH}:./src" -# Copy over src-files -COPY src/ . +FROM base as test +COPY test_requirements.txt . +RUN pip3 install --no-cache-dir -r test_requirements.txt +COPY tests/ tests/ +RUN python -m pytest tests -ENTRYPOINT ["fondant", "execute", "main"] \ No newline at end of file +FROM base +WORKDIR /component/src +ENTRYPOINT ["fondant", "execute", "main"] diff --git a/components/prompt_based_laion_retrieval/README.md b/components/prompt_based_laion_retrieval/README.md index af43a9826..0551730d9 100644 --- a/components/prompt_based_laion_retrieval/README.md +++ b/components/prompt_based_laion_retrieval/README.md @@ -12,13 +12,12 @@ This component doesn’t return the actual images, only URLs. 
**This component consumes:** -- prompts - - text: string +- prompts_text: string **This component produces:** -- images - - url: string +- images_url: string +- prompt_id: string ### Arguments @@ -52,3 +51,9 @@ prompt_based_laion_retrieval_op = ComponentOp.from_registry( pipeline.add_op(prompt_based_laion_retrieval_op, dependencies=[...]) #Add previous component as dependency ``` +### Testing + +You can run the tests using docker with BuildKit. From this directory, run: +``` +docker build . --target test +``` diff --git a/components/prompt_based_laion_retrieval/fondant_component.yaml b/components/prompt_based_laion_retrieval/fondant_component.yaml index fdd7589dc..3ac3604ac 100644 --- a/components/prompt_based_laion_retrieval/fondant_component.yaml +++ b/components/prompt_based_laion_retrieval/fondant_component.yaml @@ -10,17 +10,16 @@ tags: - Data retrieval consumes: - prompts: - fields: - text: - type: string + prompts_text: + type: string produces: - images: - fields: - url: - type: string - additionalSubsets: false + images_url: + type: string + prompt_id: + type: string + +previous_index: prompt_id args: num_images: diff --git a/components/prompt_based_laion_retrieval/src/main.py b/components/prompt_based_laion_retrieval/src/main.py index c9459060f..bd3cee783 100644 --- a/components/prompt_based_laion_retrieval/src/main.py +++ b/components/prompt_based_laion_retrieval/src/main.py @@ -41,6 +41,10 @@ def __init__( modality=Modality.IMAGE, ) + def query(self, id_: t.Any, prompt: str) -> t.List[t.Dict]: + results = self.client.query(text=prompt) + return [dict(d, prompt_id=id_) for d in results] + def transform( self, dataframe: pd.DataFrame, @@ -53,21 +57,20 @@ async def async_query(): futures = [ loop.run_in_executor( executor, - self.client.query, - prompt, + self.query, + row.id, + row.prompts_text, ) - for prompt in dataframe["prompts"]["text"] + for row in dataframe.itertuples() ] for response in await asyncio.gather(*futures): results.extend(response) loop.run_until_complete(async_query()) - results_df = pd.DataFrame(results)[["id", "url"]] + results_df = pd.DataFrame(results)[["id", "url", "prompt_id"]] results_df = results_df.set_index("id") - # Cast the index to string - results_df.index = results_df.index.astype(str) - results_df.columns = [["images"], ["url"]] + results_df = results_df.rename(columns={"url": "images_url"}) return results_df diff --git a/components/prompt_based_laion_retrieval/test_requirements.txt b/components/prompt_based_laion_retrieval/test_requirements.txt new file mode 100644 index 000000000..2a929edcc --- /dev/null +++ b/components/prompt_based_laion_retrieval/test_requirements.txt @@ -0,0 +1 @@ +pytest==7.4.2 diff --git a/components/prompt_based_laion_retrieval/tests/pytest.ini b/components/prompt_based_laion_retrieval/tests/pytest.ini new file mode 100644 index 000000000..bf6a8a517 --- /dev/null +++ b/components/prompt_based_laion_retrieval/tests/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = ../src \ No newline at end of file diff --git a/components/prompt_based_laion_retrieval/tests/test_component.py b/components/prompt_based_laion_retrieval/tests/test_component.py new file mode 100644 index 000000000..7a3a268e6 --- /dev/null +++ b/components/prompt_based_laion_retrieval/tests/test_component.py @@ -0,0 +1,66 @@ +import typing as t + +import pandas as pd + +from src.main import LAIONRetrievalComponent + + +def test_component(monkeypatch): + def mocked_client_query(text: str) -> t.List[dict]: + if text == "first prompt": + return [ + { + "id": "a", + "url": 
"http://a", + }, + { + "id": "b", + "url": "http://b", + }, + ] + if text == "second prompt": + return [ + { + "id": "c", + "url": "http://c", + }, + { + "id": "d", + "url": "http://d", + }, + ] + msg = f"Unexpected value: `text` was {text}" + raise ValueError(msg) + + input_dataframe = pd.DataFrame.from_dict( + { + "id": ["1", "2"], + "prompts_text": ["first prompt", "second prompt"], + }, + ) + + expected_output_dataframe = pd.DataFrame.from_dict( + { + "id": ["a", "b", "c", "d"], + "url": ["http://a", "http://b", "http://c", "http://d"], + "prompt_id": ["1", "1", "2", "2"], + }, + ) + expected_output_dataframe = expected_output_dataframe.set_index("id") + + component = LAIONRetrievalComponent( + num_images=2, + aesthetic_score=9, + aesthetic_weight=0.5, + url="", + ) + + monkeypatch.setattr(component.client, "query", mocked_client_query) + + output_dataframe = component.transform(input_dataframe) + + pd.testing.assert_frame_equal( + left=expected_output_dataframe, + right=output_dataframe, + check_dtype=False, + ) diff --git a/components/resize_images/README.md b/components/resize_images/README.md index 593b2ca76..89561e7a5 100644 --- a/components/resize_images/README.md +++ b/components/resize_images/README.md @@ -7,13 +7,11 @@ Component that resizes images based on given width and height **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- images - - data: binary +- images_data: binary ### Arguments diff --git a/components/resize_images/fondant_component.yaml b/components/resize_images/fondant_component.yaml index 6ab866d12..6112815c4 100644 --- a/components/resize_images/fondant_component.yaml +++ b/components/resize_images/fondant_component.yaml @@ -5,16 +5,12 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - images: - fields: - data: - type: binary + images_data: + type: binary args: resize_width: diff --git a/components/resize_images/src/main.py b/components/resize_images/src/main.py index 434dd29db..d5d4207bb 100644 --- a/components/resize_images/src/main.py +++ b/components/resize_images/src/main.py @@ -29,6 +29,6 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: axis=1, ) - dataframe[("images", "data")] = result + dataframe["images_data"] = result return dataframe diff --git a/components/segment_images/README.md b/components/segment_images/README.md index 9f475d516..418eacb13 100644 --- a/components/segment_images/README.md +++ b/components/segment_images/README.md @@ -7,13 +7,11 @@ Component that creates segmentation masks for images using a model from the Hugg **This component consumes:** -- images - - data: binary +- images_data: binary **This component produces:** -- segmentations - - data: binary +- segmentations_data: binary ### Arguments diff --git a/components/segment_images/fondant_component.yaml b/components/segment_images/fondant_component.yaml index fca45e541..34fbd9fcd 100644 --- a/components/segment_images/fondant_component.yaml +++ b/components/segment_images/fondant_component.yaml @@ -5,16 +5,12 @@ tags: - Image processing consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - segmentations: - fields: - data: - type: binary + segmentations_data: + type: binary args: model_id: diff --git a/components/segment_images/src/main.py b/components/segment_images/src/main.py index 0f8f46faa..4e06c5d89 100644 --- a/components/segment_images/src/main.py +++ 
b/components/segment_images/src/main.py @@ -150,4 +150,4 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: results.append(segmentations) - return pd.concat(results).to_frame(name=("segmentations", "data")) + return pd.concat(results).to_frame(name="segmentations_data") diff --git a/components/write_to_hf_hub/README.md b/components/write_to_hf_hub/README.md index 54978470a..ec80bf334 100644 --- a/components/write_to_hf_hub/README.md +++ b/components/write_to_hf_hub/README.md @@ -7,8 +7,7 @@ Component that writes a dataset to the hub **This component consumes:** -- dummy_variable - - data: binary +- dummy_variable: binary **This component produces no data.** diff --git a/components/write_to_hf_hub/fondant_component.yaml b/components/write_to_hf_hub/fondant_component.yaml index 363f2507c..b4391fbbc 100644 --- a/components/write_to_hf_hub/fondant_component.yaml +++ b/components/write_to_hf_hub/fondant_component.yaml @@ -5,10 +5,8 @@ tags: - Data writing consumes: - dummy_variable: #TODO: fill in here - fields: - data: - type: binary + dummy_variable: + type: binary args: hf_token: @@ -23,7 +21,7 @@ args: image_column_names: description: A list containing the image column names. Used to format to image to HF hub format type: list - default: [] + default: [ ] column_name_mapping: description: Mapping of the consumed fondant column names to the written hub column names type: dict diff --git a/components/write_to_hf_hub/src/main.py b/components/write_to_hf_hub/src/main.py index 0ed01b961..6d464f0f2 100644 --- a/components/write_to_hf_hub/src/main.py +++ b/components/write_to_hf_hub/src/main.py @@ -74,16 +74,15 @@ def write( # Get columns to write and schema write_columns = [] schema_dict = {} - for subset_name, subset in self.spec.consumes.items(): - for field in subset.fields.values(): - column_name = f"{subset_name}_{field.name}" - write_columns.append(column_name) - if self.image_column_names and column_name in self.image_column_names: - schema_dict[column_name] = datasets.Image() - else: - schema_dict[column_name] = generate_from_arrow_type( - field.type.value, - ) + for field_name, field in self.spec.consumes.items(): + column_name = field.name + write_columns.append(column_name) + if self.image_column_names and column_name in self.image_column_names: + schema_dict[column_name] = datasets.Image() + else: + schema_dict[column_name] = generate_from_arrow_type( + field.type.value, + ) schema = datasets.Features(schema_dict).arrow_schema dataframe = dataframe[write_columns] diff --git a/scripts/component_readme/readme_template.md b/scripts/component_readme/readme_template.md index 1266b56d3..54ad2e417 100644 --- a/scripts/component_readme/readme_template.md +++ b/scripts/component_readme/readme_template.md @@ -8,11 +8,8 @@ {% if consumes %} **This component consumes:** -{% for subset_name, subset in consumes.items() %} -- {{ subset_name }} -{% for field in subset.fields.values() %} - - {{ field.name }}: {{ field.type.value }} -{% endfor %} +{% for field_name, field in consumes.items() %} +- {{ field.name }}: {{ field.type.value }} {% endfor %} {% else %} **This component consumes no data.** @@ -21,11 +18,8 @@ {% if produces %} **This component produces:** -{% for subset_name, subset in produces.items() %} -- {{ subset_name }} -{% for field in subset.fields.values() %} - - {{ field.name }}: {{ field.type.value }} -{% endfor %} +{% for field_name, field in produces.items() %} +- {{ field.name }}: {{ field.type.value }} {% endfor %} {% else %} **This component produces no data.** 
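Reviewer note: every component change above follows one mechanical rule — the old two-level `(subset, field)` addressing becomes a flat `{subset}_{field}` column name, both in `fondant_component.yaml` and in the pandas code. A minimal sketch of the before/after access pattern inside a transform (the dataframe content and the derived column are made up purely for illustration):

```python
import pandas as pd

# A partition as a component now receives it: flat, prefixed column names.
dataframe = pd.DataFrame(
    {
        "images_data": [b"<png bytes>", b"<png bytes>"],
        "images_width": [640, 480],
        "images_height": [480, 640],
    },
    index=pd.Index(["a", "b"], name="id"),
)

# Old style, no longer valid after this PR:
#     images = dataframe["images"]["data"]      # hierarchical lookup
#     dataframe[("images", "width")] = ...      # tuple column keys

# New style: plain string keys, no MultiIndex round-trip in the executor.
images = dataframe["images_data"]
dataframe["images_aspect_ratio"] = (
    dataframe["images_width"] / dataframe["images_height"]
)
```

The src/fondant changes below are the framework side of the same rule: the executor stops converting columns to and from a MultiIndex, and data IO reads and writes flat fields directly.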
diff --git a/src/fondant/component/data_io.py b/src/fondant/component/data_io.py index 7023c1ee2..79a181f8d 100644 --- a/src/fondant/component/data_io.py +++ b/src/fondant/component/data_io.py @@ -1,16 +1,19 @@ import logging import os import typing as t +from collections import defaultdict import dask.dataframe as dd from dask.diagnostics import ProgressBar from dask.distributed import Client -from fondant.core.component_spec import ComponentSpec, ComponentSubset +from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest logger = logging.getLogger(__name__) +DEFAULT_INDEX_NAME = "id" + class DataIO: def __init__(self, *, manifest: Manifest, component_spec: ComponentSpec) -> None: @@ -82,73 +85,48 @@ def partition_loaded_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame: return dataframe - def _load_subset(self, subset_name: str, fields: t.List[str]) -> dd.DataFrame: + def load_dataframe(self) -> dd.DataFrame: """ - Function that loads a subset from the manifest as a Dask dataframe. - - Args: - subset_name: the name of the subset to load - fields: the fields to load from the subset + Function that loads the subsets defined in the component spec as a single Dask dataframe for + the user. Returns: - The subset as a dask dataframe + The Dask dataframe with all columns defined in the manifest field mapping """ - subset = self.manifest.subsets[subset_name] - remote_path = subset.location - - logger.info(f"Loading subset {subset_name} with fields {fields}...") + dataframe = None + field_mapping = defaultdict(list) - subset_df = dd.read_parquet( - remote_path, - columns=fields, - calculate_divisions=True, + # Add index field to field mapping to guarantee start reading with the index dataframe + field_mapping[self.manifest.get_field_location(DEFAULT_INDEX_NAME)].append( + DEFAULT_INDEX_NAME, ) - # add subset prefix to columns - subset_df = subset_df.rename( - columns={col: subset_name + "_" + col for col in subset_df.columns}, - ) + for field_name in self.component_spec.consumes: + location = self.manifest.get_field_location(field_name) + field_mapping[location].append(field_name) - return subset_df - - def _load_index(self) -> dd.DataFrame: - """ - Function that loads the index from the manifest as a Dask dataframe. - - Returns: - The index as a dask dataframe - """ - # get index subset from the manifest - index = self.manifest.index - # get remote path - remote_path = index.location - - # load index from parquet, expecting id and source columns - return dd.read_parquet(remote_path, calculate_divisions=True) - - def load_dataframe(self) -> dd.DataFrame: - """ - Function that loads the subsets defined in the component spec as a single Dask dataframe for - the user. + for location, fields in field_mapping.items(): + if DEFAULT_INDEX_NAME in fields: + fields.remove(DEFAULT_INDEX_NAME) - Returns: - The Dask dataframe with the field columns in the format (_) - as well as the index columns. - """ - # load index into dataframe - dataframe = self._load_index() - for name, subset in self.component_spec.consumes.items(): - fields = list(subset.fields.keys()) - subset_df = self._load_subset(name, fields) - # left joins -> filter on index - dataframe = dd.merge( - dataframe, - subset_df, - left_index=True, - right_index=True, - how="left", + partial_df = dd.read_parquet( + location, + columns=fields, + index=DEFAULT_INDEX_NAME, + calculate_divisions=True, ) + if dataframe is None: + # ensure that the index is set correctly and divisions are known. 
+ dataframe = partial_df + else: + dataframe = dataframe.merge( + partial_df, + how="left", + left_index=True, + right_index=True, + ) + dataframe = self.partition_loaded_dataframe(dataframe) logging.info(f"Columns of dataframe: {list(dataframe.columns)}") @@ -170,79 +148,48 @@ def write_dataframe( dataframe: dd.DataFrame, dask_client: t.Optional[Client] = None, ) -> None: - write_tasks = [] + columns_to_produce = [ + column_name for column_name, field in self.component_spec.produces.items() + ] - dataframe.index = dataframe.index.rename("id") + dataframe.index = dataframe.index.rename(DEFAULT_INDEX_NAME) - # Turn index into an empty dataframe so we can write it - index_df = dataframe.index.to_frame().drop(columns=["id"]) - write_index_task = self._write_subset( - index_df, - subset_name="index", - subset_spec=self.component_spec.index, - ) - write_tasks.append(write_index_task) + # validation that all columns are in the dataframe + self.validate_dataframe_columns(dataframe, columns_to_produce) - for subset_name, subset_spec in self.component_spec.produces.items(): - subset_df = self._extract_subset_dataframe( - dataframe, - subset_name=subset_name, - subset_spec=subset_spec, - ) - write_subset_task = self._write_subset( - subset_df, - subset_name=subset_name, - subset_spec=subset_spec, - ) - write_tasks.append(write_subset_task) + dataframe = dataframe[columns_to_produce] + write_task = self._write_dataframe(dataframe) with ProgressBar(): logging.info("Writing data...") - # alternative implementation possible: futures = client.compute(...) - dd.compute(*write_tasks, scheduler=dask_client) + dd.compute(write_task, scheduler=dask_client) @staticmethod - def _extract_subset_dataframe( - dataframe: dd.DataFrame, - *, - subset_name: str, - subset_spec: ComponentSubset, - ) -> dd.DataFrame: - """Create subset dataframe to save with the original field name as the column name.""" - # Create a new dataframe with only the columns needed for the output subset - subset_columns = [f"{subset_name}_{field}" for field in subset_spec.fields] - try: - subset_df = dataframe[subset_columns] - except KeyError as e: + def validate_dataframe_columns(dataframe: dd.DataFrame, columns: t.List[str]): + """Validates that all columns are available in the dataset.""" + missing_fields = [] + for col in columns: + if col not in dataframe.columns: + missing_fields.append(col) + + if missing_fields: msg = ( - f"Field {e.args[0]} defined in output subset {subset_name} " + f"Fields {missing_fields} defined in output dataset " f"but not found in dataframe" ) raise ValueError( msg, ) - # Remove the subset prefix from the column names - subset_df = subset_df.rename( - columns={col: col[(len(f"{subset_name}_")) :] for col in subset_columns}, + def _write_dataframe(self, dataframe: dd.DataFrame) -> dd.core.Scalar: + """Create dataframe writing task.""" + location = ( + self.manifest.base_path + "/" + self.component_spec.component_folder_name ) - - return subset_df - - def _write_subset( - self, - dataframe: dd.DataFrame, - *, - subset_name: str, - subset_spec: ComponentSubset, - ) -> dd.core.Scalar: - if subset_name == "index": - location = self.manifest.index.location - else: - location = self.manifest.subsets[subset_name].location - - schema = {field.name: field.type.value for field in subset_spec.fields.values()} - + schema = { + field.name: field.type.value + for field in self.component_spec.produces.values() + } return self._create_write_task(dataframe, location=location, schema=schema) @staticmethod diff --git 
a/src/fondant/component/executor.py b/src/fondant/component/executor.py index 3d4d6097f..571bc60bb 100644 --- a/src/fondant/component/executor.py +++ b/src/fondant/component/executor.py @@ -491,14 +491,11 @@ def optional_fondant_arguments() -> t.List[str]: @staticmethod def wrap_transform(transform: t.Callable, *, spec: ComponentSpec) -> t.Callable: """Factory that creates a function to wrap the component transform function. The wrapper: - - Converts the columns to hierarchical format before passing the dataframe to the - transform function - Removes extra columns from the returned dataframe which are not defined in the component spec `produces` section - Sorts the columns from the returned dataframe according to the order in the component spec `produces` section to match the order in the `meta` argument passed to Dask's `map_partitions`. - - Flattens the returned dataframe columns. Args: transform: Transform method to wrap @@ -506,27 +503,13 @@ def wrap_transform(transform: t.Callable, *, spec: ComponentSpec) -> t.Callable: """ def wrapped_transform(dataframe: pd.DataFrame) -> pd.DataFrame: - # Switch to hierarchical columns - dataframe.columns = pd.MultiIndex.from_tuples( - tuple(column.split("_")) for column in dataframe.columns - ) - # Call transform method dataframe = transform(dataframe) # Drop columns not in specification - columns = [ - (subset_name, field) - for subset_name, subset in spec.produces.items() - for field in subset.fields - ] - dataframe = dataframe[columns] - - # Switch to flattened columns - dataframe.columns = [ - "_".join(column) for column in dataframe.columns.to_flat_index() - ] - return dataframe + columns = [name for name, field in spec.produces.items()] + + return dataframe[columns] return wrapped_transform @@ -552,11 +535,8 @@ def _execute_component( # Create meta dataframe with expected format meta_dict = {"id": pd.Series(dtype="object")} - for subset_name, subset in self.spec.produces.items(): - for field_name, field in subset.fields.items(): - meta_dict[f"{subset_name}_{field_name}"] = pd.Series( - dtype=pd.ArrowDtype(field.type.value), - ) + for field_name, field in self.spec.produces.items(): + meta_dict[field_name] = pd.Series(dtype=pd.ArrowDtype(field.type.value)) meta_df = pd.DataFrame(meta_dict).set_index("id") wrapped_transform = self.wrap_transform(component.transform, spec=self.spec) @@ -568,24 +548,11 @@ def _execute_component( ) # Clear divisions if component spec indicates that the index is changed - if self._infer_index_change(): + if self.spec.previous_index is not None: dataframe.clear_divisions() return dataframe - def _infer_index_change(self) -> bool: - """Infer if this component changes the index based on its component spec.""" - if not self.spec.accepts_additional_subsets: - return True - if not self.spec.outputs_additional_subsets: - return True - for subset in self.spec.consumes.values(): - if not subset.additional_fields: - return True - return any( - not subset.additional_fields for subset in self.spec.produces.values() - ) - class DaskWriteExecutor(Executor[DaskWriteComponent]): """Base class for a Fondant write component.""" diff --git a/src/fondant/core/component_spec.py b/src/fondant/core/component_spec.py index cf177e07c..1700e10a1 100644 --- a/src/fondant/core/component_spec.py +++ b/src/fondant/core/component_spec.py @@ -66,34 +66,6 @@ def kubeflow_type(self) -> str: return lookup[self.type] -class ComponentSubset: - """ - Class representing a Fondant Component subset. 
- - Args: - specification: the part of the component json representing the subset - """ - - def __init__(self, specification: t.Dict[str, t.Any]) -> None: - self._specification = specification - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self._specification!r})" - - @property - def fields(self) -> t.Mapping[str, Field]: - return types.MappingProxyType( - { - name: Field(name=name, type=Type.from_json(field)) - for name, field in self._specification["fields"].items() - }, - ) - - @property - def additional_fields(self) -> bool: - return self._specification.get("additionalFields", True) - - class ComponentSpec: """ Class representing a Fondant component specification. @@ -190,38 +162,28 @@ def tags(self) -> t.List[str]: return self._specification.get("tags", None) @property - def index(self): - return ComponentSubset({"fields": {}}) - - @property - def consumes(self) -> t.Mapping[str, ComponentSubset]: - """The subsets consumed by the component as an immutable mapping.""" + def consumes(self) -> t.Mapping[str, Field]: + """The fields consumed by the component as an immutable mapping.""" return types.MappingProxyType( { - name: ComponentSubset(subset) - for name, subset in self._specification.get("consumes", {}).items() - if name != "additionalSubsets" + name: Field(name=name, type=Type.from_json(field)) + for name, field in self._specification.get("consumes", {}).items() }, ) @property - def produces(self) -> t.Mapping[str, ComponentSubset]: - """The subsets produced by the component as an immutable mapping.""" + def produces(self) -> t.Mapping[str, Field]: + """The fields produced by the component as an immutable mapping.""" return types.MappingProxyType( { - name: ComponentSubset(subset) - for name, subset in self._specification.get("produces", {}).items() - if name != "additionalSubsets" + name: Field(name=name, type=Type.from_json(field)) + for name, field in self._specification.get("produces", {}).items() }, ) @property - def accepts_additional_subsets(self) -> bool: - return self._specification.get("consumes", {}).get("additionalSubsets", True) - - @property - def outputs_additional_subsets(self) -> bool: - return self._specification.get("produces", {}).get("additionalSubsets", True) + def previous_index(self) -> t.Optional[str]: + return self._specification.get("previous_index") @property def args(self) -> t.Mapping[str, Argument]: diff --git a/src/fondant/core/manifest.py b/src/fondant/core/manifest.py index 692c4e7cd..4f0aab480 100644 --- a/src/fondant/core/manifest.py +++ b/src/fondant/core/manifest.py @@ -18,59 +18,6 @@ from fondant.core.schema import Field, Type -class Subset: - """ - Class representing a Fondant subset. 
- - Args: - specification: The part of the manifest json representing the subset - base_path: The base path which the subset location is defined relative to - """ - - def __init__(self, specification: dict, *, base_path: str) -> None: - self._specification = specification - self._base_path = base_path - - @property - def location(self) -> str: - """The absolute location of the subset.""" - return self._base_path + self._specification["location"] - - @property - def fields(self) -> t.Mapping[str, Field]: - """The fields of the subset returned as an immutable mapping.""" - return types.MappingProxyType( - { - name: Field(name=name, type=Type.from_json(field)) - for name, field in self._specification["fields"].items() - }, - ) - - def add_field(self, name: str, type_: Type, *, overwrite: bool = False) -> None: - if not overwrite and name in self._specification["fields"]: - msg = f"A field with name {name} already exists" - raise ValueError(msg) - - self._specification["fields"][name] = type_.to_json() - - def remove_field(self, name: str) -> None: - del self._specification["fields"][name] - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self._specification!r})" - - -class Index(Subset): - """Special case of a subset for the index, which has fixed fields.""" - - @property - def fields(self) -> t.Dict[str, Field]: - return { - "id": Field(name="id", type=Type("string")), - "source": Field(name="source", type=Type("string")), - } - - @dataclass class Metadata: """ @@ -171,8 +118,8 @@ def create( specification = { "metadata": metadata.to_dict(), - "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}/index"}, - "subsets": {}, + "index": {"location": f"/{component_id}"}, + "fields": {}, } return cls(specification) @@ -196,6 +143,10 @@ def copy(self) -> "Manifest": def metadata(self) -> t.Dict[str, t.Any]: return self._specification["metadata"] + @property + def index(self) -> Field: + return Field(name="id", location=self._specification["index"]["location"]) + def update_metadata(self, key: str, value: t.Any) -> None: self.metadata[key] = value @@ -203,6 +154,17 @@ def update_metadata(self, key: str, value: t.Any) -> None: def base_path(self) -> str: return self.metadata["base_path"] + def get_field_location(self, field_name: str): + """Return absolute path to the field location.""" + if field_name == "id": + return f"{self.base_path}/{self.pipeline_name}/{self.run_id}{self.index.location}" + if field_name not in self.fields: + msg = f"Field {field_name} is not available in the manifest." 
+ raise ValueError(msg) + + field = self.fields[field_name] + return f"{self.base_path}/{self.pipeline_name}/{self.run_id}{field.location}" + @property def run_id(self) -> str: return self.metadata["run_id"] @@ -220,41 +182,63 @@ def cache_key(self) -> str: return self.metadata["cache_key"] @property - def index(self) -> Index: - return Index(self._specification["index"], base_path=self.base_path) - - @property - def subsets(self) -> t.Mapping[str, Subset]: - """The subsets of the manifest as an immutable mapping.""" + def fields(self) -> t.Mapping[str, Field]: + """The fields of the manifest as an immutable mapping.""" return types.MappingProxyType( { - name: Subset(subset, base_path=self.base_path) - for name, subset in self._specification["subsets"].items() + name: Field( + name=name, + type=Type.from_json(field), + location=field["location"], + ) + for name, field in self._specification["fields"].items() }, ) - def add_subset( - self, - name: str, - fields: t.Iterable[t.Union[Field, t.Tuple[str, Type]]], - ) -> None: - if name in self._specification["subsets"]: - msg = f"A subset with name {name} already exists" + def add_or_update_field(self, field: Field, overwrite: bool = False): + """Add or update field to manifest.""" + if field.name == "index": + self._add_or_update_index(field, overwrite=True) + elif overwrite is False and field.name in self._specification["fields"]: + msg = ( + f"A field with name {field.name} already exists. Set overwrite to true, " + f"if you want to update the field." + ) + raise ValueError(msg) + else: + self._specification["fields"][field.name] = { + "location": f"/{self.component_id}", + **field.type.to_json(), + } + + def _add_or_update_index(self, field: Field, overwrite: bool = True): + """Add or update the manifest index.""" + if overwrite is False: + msg = ( + "The index already exists. Set overwrite to true, " + "if you want to update the index." + ) + raise ValueError(msg) + + if field.name != "index": + msg = ( + f"The field name is {field.name}. If you try to update the index, " # nosec B608 + f"set the field name to `index`." 
) raise ValueError(msg) - self._specification["subsets"][name] = { - "location": f"/{self.pipeline_name}/{self.run_id}/{self.component_id}/{name}", - "fields": {name: type_.to_json() for name, type_ in fields}, + self._specification["index"] = { + "location": f"/{field.location}", } - def remove_subset(self, name: str) -> None: - if name not in self._specification["subsets"]: - msg = f"Subset {name} not found in specification" + def remove_field(self, name: str) -> None: + if name not in self._specification["fields"]: + msg = f"Field {name} not found in specification" raise ValueError(msg) - del self._specification["subsets"][name] + del self._specification["fields"][name] - def evolve( # noqa : PLR0912 (too many branches) + def evolve( self, component_spec: ComponentSpec, *, @@ -274,68 +258,26 @@ def evolve( # noqa : PLR0912 (too many branches) # Update `component_id` of the metadata component_id = component_spec.component_folder_name evolved_manifest.update_metadata(key="component_id", value=component_id) + if run_id is not None: evolved_manifest.update_metadata(key="run_id", value=run_id) - # Update index location as this is currently always rewritten - evolved_manifest.index._specification[ - "location" - ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}/index" - - # If additionalSubsets is False in consumes, - # Remove all subsets from the manifest that are not listed - if not component_spec.accepts_additional_subsets: - for subset_name in evolved_manifest.subsets: - if subset_name not in component_spec.consumes: - evolved_manifest.remove_subset(subset_name) - - # If additionalSubsets is False in produces, - # Remove all subsets from the manifest that are not listed - if not component_spec.outputs_additional_subsets: - for subset_name in evolved_manifest.subsets: - if subset_name not in component_spec.produces: - evolved_manifest.remove_subset(subset_name) - - # If additionalFields is False for a consumed subset, - # Remove all fields from that subset that are not listed - for subset_name, subset in component_spec.consumes.items(): - if subset_name in evolved_manifest.subsets and not subset.additional_fields: - for field_name in evolved_manifest.subsets[subset_name].fields: - if field_name not in subset.fields: - evolved_manifest.subsets[subset_name].remove_field( - field_name, - ) - - # For each output subset defined in the component, add or update it - for subset_name, subset in component_spec.produces.items(): - # Subset is already in manifest, update it - if subset_name in evolved_manifest.subsets: - # If additional fields are not allowed, remove the fields not defined in the - # component spec produces section - if not subset.additional_fields: - for field_name in evolved_manifest.subsets[subset_name].fields: - if field_name not in subset.fields: - evolved_manifest.subsets[subset_name].remove_field( - field_name, - ) - - # Add fields defined in the component spec produces section - # Overwrite to persist changes to the field (eg.
type of column) - for field in subset.fields.values(): - evolved_manifest.subsets[subset_name].add_field( - field.name, - field.type, - overwrite=True, - ) - - # Update subset location as this is currently always rewritten - evolved_manifest.subsets[subset_name]._specification[ - "location" - ] = f"/{self.pipeline_name}/{evolved_manifest.run_id}/{component_id}/{subset_name}" - - # Subset is not yet in manifest, add it - else: - evolved_manifest.add_subset(subset_name, subset.fields.values()) + # Update index location as this is always rewritten + evolved_manifest.add_or_update_field( + Field(name="index", location=component_spec.component_folder_name), + ) + + # Remove all previous fields if the component changes the index + if component_spec.previous_index: + for field_name in evolved_manifest.fields: + evolved_manifest.remove_field(field_name) + + # Add or update all produced fields defined in the component spec + for name, field in component_spec.produces.items(): + # If field was not part of the input manifest, add field to output manifest. + # If field was part of the input manifest and got produced by the component, update + # the manifest field. + evolved_manifest.add_or_update_field(field, overwrite=True) return evolved_manifest diff --git a/src/fondant/core/schema.py b/src/fondant/core/schema.py index ca9bb0944..2599b5de1 100644 --- a/src/fondant/core/schema.py +++ b/src/fondant/core/schema.py @@ -161,11 +161,33 @@ def __eq__(self, other): return False -class Field(t.NamedTuple): - """Class representing a single field or column in a Fondant subset.""" +class Field: + """Class representing a single field or column in a Fondant dataset.""" + + def __init__( + self, + name: str, + type: Type = Type("null"), + location: str = "", + ) -> None: + self._name = name + self._type = type + self._location = location - name: str - type: Type + @property + def name(self) -> str: + """The name of the field.""" + return self._name + + @property + def type(self) -> Type: + """The type of the field.""" + return self._type + + @property + def location(self) -> str: + """The relative location of the field.""" + return self._location def validate_partition_size(arg_value): diff --git a/src/fondant/core/schemas/component_spec.json b/src/fondant/core/schemas/component_spec.json index 8d684a3e5..dfa6bf68c 100644 --- a/src/fondant/core/schemas/component_spec.json +++ b/src/fondant/core/schemas/component_spec.json @@ -28,44 +28,19 @@ } }, "consumes": { - "$ref": "#/definitions/subsets" + "$ref": "common.json#/definitions/fields" }, "produces": { - "$ref": "#/definitions/subsets" + "$ref": "common.json#/definitions/fields" + }, + "previous_index": { + "type": "string" }, "args": { "$ref": "#/definitions/args" } }, "definitions": { - "subset": { - "type": "object", - "properties": { - "fields": { - "$ref": "common.json#/definitions/fields" - }, - "additionalFields": { - "type": "boolean", - "default": true - } - }, - "required": [ - "fields" - ] - }, - "subsets": { - "type": "object", - "properties": { - "additionalSubsets": { - "type": "boolean", - "default": true - } - }, - "minProperties": 1, - "additionalProperties": { - "$ref": "#/definitions/subset" - } - }, "args": { "type": "object", "minProperties": 1, diff --git a/src/fondant/core/schemas/manifest.json b/src/fondant/core/schemas/manifest.json index 00ad6d1cc..77365dd5f 100644 --- a/src/fondant/core/schemas/manifest.json +++ b/src/fondant/core/schemas/manifest.json @@ -37,36 +37,33 @@ "location" ] }, - "subsets": { - "$ref":
"#/definitions/subsets" + "fields": { + "$ref": "#/definitions/fields" } }, "required": [ "metadata", "index", - "subsets" + "fields" ], "definitions": { - "subset": { + "field": { "type": "object", "properties": { "location": { "type": "string", "pattern": "/.*" - }, - "fields": { - "$ref": "common.json#/definitions/fields" } }, "required": [ "location", - "fields" + "type" ] }, - "subsets": { + "fields": { "type": "object", "additionalProperties": { - "$ref": "#/definitions/subset" + "$ref": "#/definitions/field" } } } diff --git a/src/fondant/pipeline/pipeline.py b/src/fondant/pipeline/pipeline.py index 36f81b7db..05be61c17 100644 --- a/src/fondant/pipeline/pipeline.py +++ b/src/fondant/pipeline/pipeline.py @@ -443,13 +443,13 @@ def _validate_pipeline_definition(self, run_id: str): if not load_component: # Check subset exists for ( - component_subset_name, - component_subset, + component_field_name, + component_field, ) in component_spec.consumes.items(): - if component_subset_name not in manifest.subsets: + if component_field_name not in manifest.fields: msg = ( - f"Component '{component_spec.name}' is trying to invoke the subset " - f"'{component_subset_name}', which has not been defined or created " + f"Component '{component_spec.name}' is trying to invoke the field " + f"'{component_field_name}', which has not been defined or created " f"in the previous components." ) raise InvalidPipelineDefinition( @@ -457,36 +457,22 @@ def _validate_pipeline_definition(self, run_id: str): ) # Get the corresponding manifest fields - manifest_fields = manifest.subsets[component_subset_name].fields - - # Check fields - for field_name, subset_field in component_subset.fields.items(): - # Check if invoked field exists - if field_name not in manifest_fields: - msg = ( - f"The invoked subset '{component_subset_name}' of the " - f"'{component_spec.name}' component does not match the " - f"previously created subset definition.\n The component is " - f"trying to invoke the field '{field_name}' which has not been " - f"previously defined. 
Current available fields are " - f"{manifest_fields}\n" - ) - raise InvalidPipelineDefinition( - msg, - ) - # Check if the invoked field schema matches the current schema - if subset_field != manifest_fields[field_name]: - msg = ( - f"The invoked subset '{component_subset_name}' of the " - f"'{component_spec.name}' component does not match the " - f"previously created subset definition.\n The '{field_name}' " - f"field is currently defined with the following schema:\n" - f"{manifest_fields[field_name]}\nThe current component to " - f"trying to invoke it with this schema:\n{subset_field}" - ) - raise InvalidPipelineDefinition( - msg, - ) + manifest_field = manifest.fields[component_field_name] + + # Check if the invoked field schema matches the current schema + if component_field.type != manifest_field.type: + msg = ( + f"The invoked field '{component_field_name}' of the " + f"'{component_spec.name}' component does not match the " + f"previously created field type.\n The '{manifest_field.name}' " + f"field is currently defined with the following type:\n" + f"{manifest_field.type}\nThe current component is " + f"trying to invoke it with this type:\n{component_field.type}" + ) + raise InvalidPipelineDefinition( + msg, + ) + manifest = manifest.evolve(component_spec, run_id=run_id) load_component = False diff --git a/tests/example_specs/components/arguments/component.yaml b/tests/component/examples/component_specs/arguments/component.yaml similarity index 100% rename from tests/example_specs/components/arguments/component.yaml rename to tests/component/examples/component_specs/arguments/component.yaml diff --git a/tests/example_specs/components/arguments/component_default_args.yaml b/tests/component/examples/component_specs/arguments/component_default_args.yaml similarity index 100% rename from tests/example_specs/components/arguments/component_default_args.yaml rename to tests/component/examples/component_specs/arguments/component_default_args.yaml diff --git a/tests/example_specs/components/arguments/input_manifest.json b/tests/component/examples/component_specs/arguments/input_manifest.json similarity index 60% rename from tests/example_specs/components/arguments/input_manifest.json rename to tests/component/examples/component_specs/arguments/input_manifest.json index d98ddd95b..9ee2494f9 100644 --- a/tests/example_specs/components/arguments/input_manifest.json +++ b/tests/component/examples/component_specs/arguments/input_manifest.json @@ -7,16 +7,12 @@ "cache_key": "00" }, "index": { - "location": "/index" + "location": "/component_1" }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - } - } + "fields": { + "data": { + "type": "binary", + "location": "/component_1" } } } \ No newline at end of file diff --git a/tests/example_specs/components/component.yaml b/tests/component/examples/component_specs/component.yaml similarity index 56% rename from tests/example_specs/components/component.yaml rename to tests/component/examples/component_specs/component.yaml index 19c8d5856..d1f28b76e 100644 --- a/tests/example_specs/components/component.yaml +++ b/tests/component/examples/component_specs/component.yaml @@ -3,20 +3,14 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - embeddings: - fields: - data: - type: array - items: - type: float32 - additionalFields: false - + images_data: + type: array + items: + type: float32 args:
flag: diff --git a/tests/component/examples/component_specs/input_manifest.json b/tests/component/examples/component_specs/input_manifest.json new file mode 100644 index 000000000..80fa0b91d --- /dev/null +++ b/tests/component/examples/component_specs/input_manifest.json @@ -0,0 +1,17 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "/bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/example_component" + }, + "fields": { + "data": { + "location": "/example_component", + "type": "binary" + } + } +} \ No newline at end of file diff --git a/tests/component/examples/data/components/1.yaml b/tests/component/examples/data/components/1.yaml new file mode 100644 index 000000000..95e5e578f --- /dev/null +++ b/tests/component/examples/data/components/1.yaml @@ -0,0 +1,29 @@ +name: Test component 1 +description: This is an example component +image: example_component:latest + +consumes: + Name: + type: "string" + HP: + type: "int32" + + Type 1: + type: "string" + Type 2: + type: "string" + +produces: + Name: + type: "string" + HP: + type: "int32" + Type 1: + type: "string" + Type 2: + type: "string" + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/component/examples/data/manifest.json b/tests/component/examples/data/manifest.json new file mode 100644 index 000000000..cc579fef1 --- /dev/null +++ b/tests/component/examples/data/manifest.json @@ -0,0 +1,29 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "tests/component/examples/data", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/component_1" + }, + "fields": { + "Name": { + "type": "string", + "location": "/component_1" + }, + "HP": { + "type": "int32", + "location": "/component_1" + }, + "Type 1": { + "type": "string", + "location": "/component_2" + }, + "Type 2": { + "type": "string", + "location": "/component_2" + } + } +} \ No newline at end of file diff --git a/tests/example_data/subsets_input/properties/part.0.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.0.parquet similarity index 58% rename from tests/example_data/subsets_input/properties/part.0.parquet rename to tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.0.parquet index 521704598..fa5d96dad 100644 Binary files a/tests/example_data/subsets_input/properties/part.0.parquet and b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.0.parquet differ diff --git a/tests/example_data/subsets_input/properties/part.1.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.1.parquet similarity index 59% rename from tests/example_data/subsets_input/properties/part.1.parquet rename to tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.1.parquet index ac842a070..0c86db04d 100644 Binary files a/tests/example_data/subsets_input/properties/part.1.parquet and b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.1.parquet differ diff --git a/tests/example_data/subsets_input/properties/part.2.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.2.parquet similarity index 60% rename from tests/example_data/subsets_input/properties/part.2.parquet rename to tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.2.parquet index 
1d7df89db..d226a4249 100644 Binary files a/tests/example_data/subsets_input/properties/part.2.parquet and b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_1/part.2.parquet differ diff --git a/tests/example_data/subsets_input/types/part.0.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.0.parquet similarity index 50% rename from tests/example_data/subsets_input/types/part.0.parquet rename to tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.0.parquet index 6074b2fc3..80c4500be 100644 Binary files a/tests/example_data/subsets_input/types/part.0.parquet and b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.0.parquet differ diff --git a/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.1.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.1.parquet new file mode 100644 index 000000000..2dd74184f Binary files /dev/null and b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.1.parquet differ diff --git a/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.2.parquet b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.2.parquet new file mode 100644 index 000000000..1ae8001c0 Binary files /dev/null and b/tests/component/examples/data/test_pipeline/test_pipeline_12345/component_2/part.2.parquet differ diff --git a/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt b/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt new file mode 100644 index 000000000..4a9ff8afc --- /dev/null +++ b/tests/component/examples/mock_base_path/example_pipeline/cache/42.txt @@ -0,0 +1 @@ +tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json \ No newline at end of file diff --git a/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json b/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json new file mode 100644 index 000000000..47c2fe949 --- /dev/null +++ b/tests/component/examples/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json @@ -0,0 +1,31 @@ +{ + "metadata": { + "pipeline_name": "example_pipeline", + "base_path": "tests/example_data/subsets_input/mock_base_path", + "run_id": "example_pipeline_2023", + "component_id": "component_1", + "cache_key": "42" + }, + "index": { + "location": "/component_1" + }, + "fields": + { + "data": { + "type": "binary", + "location": "/component_1" + }, + "height": { + "type": "int32", + "location": "/component_1" + }, + "width": { + "type": "int32", + "location": "/component_1" + }, + "captions": { + "type": "string", + "location": "/component_1" + } + } +} \ No newline at end of file diff --git a/tests/test_component.py b/tests/component/test_component.py similarity index 92% rename from tests/test_component.py rename to tests/component/test_component.py index e759bd367..830ce2963 100644 --- a/tests/test_component.py +++ b/tests/component/test_component.py @@ -23,8 +23,8 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest, Metadata -components_path = Path(__file__).parent / "example_specs/components" -base_path = Path(__file__).parent / "example_specs/mock_base_path" +components_path = Path(__file__).parent / 
"examples/component_specs" +base_path = Path(__file__).parent / "examples/mock_base_path" N_PARTITIONS = 2 @@ -377,38 +377,22 @@ def test_wrap_transform(): "description": "Component for testing", "image": "component:test", "consumes": { - "image": { - "fields": { - "height": { - "type": "int16", - }, - "width": { - "type": "int16", - }, - }, + "image_height": { + "type": "int16", }, - "caption": { - "fields": { - "text": { - "type": "string", - }, - }, + "image_width": { + "type": "int16", + }, + "caption_text": { + "type": "string", }, }, "produces": { - "caption": { - "fields": { - "text": { - "type": "string", - }, - }, + "caption_text": { + "type": "string", }, - "image": { - "fields": { - "height": { - "type": "int16", - }, - }, + "image_height": { + "type": "int16", }, }, }, @@ -425,9 +409,9 @@ def test_wrap_transform(): def transform(dataframe: pd.DataFrame) -> pd.DataFrame: # Check hierarchical columns assert dataframe.columns.tolist() == [ - ("image", "height"), - ("image", "width"), - ("caption", "text"), + "image_height", + "image_width", + "caption_text", ] return dataframe diff --git a/tests/test_data_io.py b/tests/component/test_data_io.py similarity index 61% rename from tests/test_data_io.py rename to tests/component/test_data_io.py index 9ade4a329..d9dad121f 100644 --- a/tests/test_data_io.py +++ b/tests/component/test_data_io.py @@ -8,8 +8,8 @@ from fondant.core.component_spec import ComponentSpec from fondant.core.manifest import Manifest -manifest_path = Path(__file__).parent / "example_data/manifest.json" -component_spec_path = Path(__file__).parent / "example_data/components/1.yaml" +manifest_path = Path(__file__).parent / "examples/data/manifest.json" +component_spec_path = Path(__file__).parent / "examples/data/components/1.yaml" NUMBER_OF_TEST_ROWS = 151 @@ -37,33 +37,16 @@ def dataframe(manifest, component_spec): return data_loader.load_dataframe() -def test_load_index(manifest, component_spec): - """Test the loading of just the index.""" - data_loader = DaskDataLoader(manifest=manifest, component_spec=component_spec) - index_df = data_loader._load_index() - assert len(index_df) == NUMBER_OF_TEST_ROWS - assert index_df.index.name == "id" - - -def test_load_subset(manifest, component_spec): - """Test the loading of one field of a subset.""" - data_loader = DaskDataLoader(manifest=manifest, component_spec=component_spec) - subset_df = data_loader._load_subset(subset_name="types", fields=["Type 1"]) - assert len(subset_df) == NUMBER_OF_TEST_ROWS - assert list(subset_df.columns) == ["types_Type 1"] - assert subset_df.index.name == "id" - - def test_load_dataframe(manifest, component_spec): - """Test merging of subsets in a dataframe based on a component_spec.""" + """Test merging of fields in a dataframe based on a component_spec.""" dl = DaskDataLoader(manifest=manifest, component_spec=component_spec) dataframe = dl.load_dataframe() assert len(dataframe) == NUMBER_OF_TEST_ROWS assert list(dataframe.columns) == [ - "properties_Name", - "properties_HP", - "types_Type 1", - "types_Type 2", + "Name", + "HP", + "Type 1", + "Type 2", ] assert dataframe.index.name == "id" @@ -78,7 +61,7 @@ def test_load_dataframe_default(manifest, component_spec): def test_load_dataframe_rows(manifest, component_spec): - """Test merging of subsets in a dataframe based on a component_spec.""" + """Test merging of fields in a dataframe based on a component_spec.""" dl = DaskDataLoader( manifest=manifest, component_spec=component_spec, @@ -89,34 +72,7 @@ def test_load_dataframe_rows(manifest, 
component_spec): assert dataframe.npartitions == expected_partitions -def test_write_index( - tmp_path_factory, - dataframe, - manifest, - component_spec, - dask_client, -): - """Test writing out the index.""" - with tmp_path_factory.mktemp("temp") as fn: - # override the base path of the manifest with the temp dir - manifest.update_metadata("base_path", str(fn)) - data_writer = DaskDataWriter( - manifest=manifest, - component_spec=component_spec, - ) - # write out index to temp dir - data_writer.write_dataframe(dataframe, dask_client) - number_workers = os.cpu_count() - # read written data and assert - dataframe = dd.read_parquet(fn / "index") - assert len(dataframe) == NUMBER_OF_TEST_ROWS - assert dataframe.index.name == "id" - assert dataframe.npartitions in list( - range(number_workers - 1, number_workers + 2), - ) - - -def test_write_subsets( +def test_write_dataset( tmp_path_factory, dataframe, manifest, @@ -125,11 +81,7 @@ def test_write_subsets( ): """Test writing out subsets.""" # Dictionary specifying the expected subsets to write and their column names - subset_columns_dict = { - "index": [], - "properties": ["Name", "HP"], - "types": ["Type 1", "Type 2"], - } + columns = ["Name", "HP", "Type 1", "Type 2"] with tmp_path_factory.mktemp("temp") as fn: # override the base path of the manifest with the temp dir manifest.update_metadata("base_path", str(fn)) @@ -137,13 +89,13 @@ def test_write_subsets( # write dataframe to temp dir data_writer.write_dataframe(dataframe, dask_client) # read written data and assert - for subset, subset_columns in subset_columns_dict.items(): - dataframe = dd.read_parquet(fn / subset) - assert len(dataframe) == NUMBER_OF_TEST_ROWS - assert list(dataframe.columns) == subset_columns - assert dataframe.index.name == "id" + dataframe = dd.read_parquet(fn) + assert len(dataframe) == NUMBER_OF_TEST_ROWS + assert list(dataframe.columns) == columns + assert dataframe.index.name == "id" +# TODO: check if this is still needed? def test_write_reset_index( tmp_path_factory, dataframe, @@ -151,7 +103,7 @@ def test_write_reset_index( component_spec, dask_client, ): - """Test writing out the index and subsets that have no dask index and checking + """Test writing out the index and fields that have no dask index and checking if the id index was created. 
""" dataframe = dataframe.reset_index(drop=True) @@ -160,10 +112,8 @@ def test_write_reset_index( data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) data_writer.write_dataframe(dataframe, dask_client) - - for subset in ["properties", "types", "index"]: - dataframe = dd.read_parquet(fn / subset) - assert dataframe.index.name == "id" + dataframe = dd.read_parquet(fn) + assert dataframe.index.name == "id" @pytest.mark.parametrize("partitions", list(range(1, 5))) @@ -189,29 +139,51 @@ def test_write_divisions( # noqa: PLR0913 data_writer.write_dataframe(dataframe, dask_client) - for target in ["properties", "types", "index"]: - dataframe = dd.read_parquet(fn / target) - assert dataframe.index.name == "id" - assert dataframe.npartitions == partitions + dataframe = dd.read_parquet(fn) + assert dataframe.index.name == "id" + assert dataframe.npartitions == partitions + + +def test_write_fields_invalid( + tmp_path_factory, + dataframe, + manifest, + component_spec, + dask_client, +): + """Test writing out fields but the dataframe columns are incomplete.""" + with tmp_path_factory.mktemp("temp") as fn: + # override the base path of the manifest with the temp dir + manifest.update_metadata("base_path", str(fn)) + # Drop one of the columns required in the output + dataframe = dataframe.drop(["Type 2"], axis=1) + data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) + expected_error_msg = ( + r"Fields \['Type 2'\] defined in output dataset " + r"but not found in dataframe" + ) + with pytest.raises(ValueError, match=expected_error_msg): + data_writer.write_dataframe(dataframe, dask_client) -def test_write_subsets_invalid( +def test_write_fields_invalid_several_fields_missing( tmp_path_factory, dataframe, manifest, component_spec, dask_client, ): - """Test writing out subsets but the dataframe columns are incomplete.""" + """Test writing out fields but the dataframe columns are incomplete.""" with tmp_path_factory.mktemp("temp") as fn: # override the base path of the manifest with the temp dir manifest.update_metadata("base_path", str(fn)) # Drop one of the columns required in the output - dataframe = dataframe.drop(["types_Type 2"], axis=1) + dataframe = dataframe.drop(["Type 1"], axis=1) + dataframe = dataframe.drop(["Type 2"], axis=1) data_writer = DaskDataWriter(manifest=manifest, component_spec=component_spec) expected_error_msg = ( - r"Field \['types_Type 2'\] not in index defined in output subset " - r"types but not found in dataframe" + r"Fields \['Type 1', 'Type 2'\] defined in output dataset " + r"but not found in dataframe" ) with pytest.raises(ValueError, match=expected_error_msg): data_writer.write_dataframe(dataframe, dask_client) diff --git a/tests/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml b/tests/core/examples/component_specs/invalid_component.yaml similarity index 88% rename from tests/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml rename to tests/core/examples/component_specs/invalid_component.yaml index abe5091ea..d1c88c444 100644 --- a/tests/example_pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml +++ b/tests/core/examples/component_specs/invalid_component.yaml @@ -1,4 +1,4 @@ -name: First component +name: Example component description: This is an example component image: example_component:latest @@ -14,7 +14,7 @@ produces: data: type: string -args: +Arguments: storage_args: description: Storage arguments type: str \ No newline 
at end of file diff --git a/tests/example_specs/component_specs/kubeflow_component.yaml b/tests/core/examples/component_specs/kubeflow_component.yaml similarity index 100% rename from tests/example_specs/component_specs/kubeflow_component.yaml rename to tests/core/examples/component_specs/kubeflow_component.yaml diff --git a/tests/example_specs/evolution_examples/1/component.yaml b/tests/core/examples/component_specs/valid_component.yaml similarity index 62% rename from tests/example_specs/evolution_examples/1/component.yaml rename to tests/core/examples/component_specs/valid_component.yaml index 22ae0feb1..1215af1bd 100644 --- a/tests/example_specs/evolution_examples/1/component.yaml +++ b/tests/core/examples/component_specs/valid_component.yaml @@ -1,20 +1,21 @@ name: Example component description: This is an example component image: example_component:latest +tags: + - Data loading consumes: images: - fields: - data: - type: binary - -produces: + type: binary + embeddings: - fields: - data: - type: array - items: - type: float32 + type: array + items: + type: float32 + +produces: + captions: + type: string args: storage_args: diff --git a/tests/example_specs/component_specs/valid_component_no_args.yaml b/tests/core/examples/component_specs/valid_component_no_args.yaml similarity index 59% rename from tests/example_specs/component_specs/valid_component_no_args.yaml rename to tests/core/examples/component_specs/valid_component_no_args.yaml index c3adfa6aa..de11cb2ee 100644 --- a/tests/example_specs/component_specs/valid_component_no_args.yaml +++ b/tests/core/examples/component_specs/valid_component_no_args.yaml @@ -4,12 +4,13 @@ image: example_component:latest consumes: images: - fields: - data: - type: binary + type: binary + + embeddings: + type: array + items: + type: float32 produces: captions: - fields: - data: - type: string \ No newline at end of file + type: string diff --git a/tests/example_specs/evolution_examples/8/component.yaml b/tests/core/examples/evolution_examples/1/component.yaml similarity index 62% rename from tests/example_specs/evolution_examples/8/component.yaml rename to tests/core/examples/evolution_examples/1/component.yaml index 5c204b9c2..e91ae6f46 100644 --- a/tests/example_specs/evolution_examples/8/component.yaml +++ b/tests/core/examples/evolution_examples/1/component.yaml @@ -3,10 +3,14 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary + +produces: + embeddings_data: + type: array + items: + type: float32 args: storage_args: diff --git a/tests/core/examples/evolution_examples/1/output_manifest.json b/tests/core/examples/evolution_examples/1/output_manifest.json new file mode 100644 index 000000000..2a73e5f29 --- /dev/null +++ b/tests/core/examples/evolution_examples/1/output_manifest.json @@ -0,0 +1,36 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component" + }, + "index":{ + "location":"/example_component" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + }, + "embeddings_data": { + "type": "array", + "items": { + "type": "float32" + }, + "location":"/example_component" + } + } 
+} \ No newline at end of file diff --git a/tests/example_specs/evolution_examples/4/component.yaml b/tests/core/examples/evolution_examples/2/component.yaml similarity index 59% rename from tests/example_specs/evolution_examples/4/component.yaml rename to tests/core/examples/evolution_examples/2/component.yaml index 067b06da0..95d9300d1 100644 --- a/tests/example_specs/evolution_examples/4/component.yaml +++ b/tests/core/examples/evolution_examples/2/component.yaml @@ -3,16 +3,14 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary - + images_data: + type: binary + produces: - images: - fields: - encoding: - type: string + images_data: + type: binary + +previous_index: "true" # Only used to remove old fields for now args: storage_args: diff --git a/tests/core/examples/evolution_examples/2/output_manifest.json b/tests/core/examples/evolution_examples/2/output_manifest.json new file mode 100644 index 000000000..db62fda15 --- /dev/null +++ b/tests/core/examples/evolution_examples/2/output_manifest.json @@ -0,0 +1,17 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component" + }, + "index":{ + "location":"/example_component" + }, + "fields": { + "images_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/evolution_examples/3/component.yaml b/tests/core/examples/evolution_examples/3/component.yaml new file mode 100644 index 000000000..13b1427b3 --- /dev/null +++ b/tests/core/examples/evolution_examples/3/component.yaml @@ -0,0 +1,16 @@ +name: Example component 1 +description: This is an example component +image: example_component_1:latest + +consumes: + images_data: + type: binary + +produces: + images_data: + type: string + +args: + storage_args: + description: Storage arguments + type: str diff --git a/tests/core/examples/evolution_examples/3/output_manifest.json b/tests/core/examples/evolution_examples/3/output_manifest.json new file mode 100644 index 000000000..b11f7d8a3 --- /dev/null +++ b/tests/core/examples/evolution_examples/3/output_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component_1" + }, + "index":{ + "location":"/example_component_1" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "string", + "location":"/example_component_1" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/evolution_examples/4/component.yaml b/tests/core/examples/evolution_examples/4/component.yaml new file mode 100644 index 000000000..1b766036d --- /dev/null +++ b/tests/core/examples/evolution_examples/4/component.yaml @@ -0,0 +1,12 @@ +name: Example component 1 +description: This is an example component +image: example_component_1:latest + +consumes: + images_data: + type: binary + +args: + storage_args: + description: Storage arguments + type: str diff --git a/tests/core/examples/evolution_examples/4/output_manifest.json b/tests/core/examples/evolution_examples/4/output_manifest.json new file mode 100644 index 000000000..929c380ab --- /dev/null +++ 
b/tests/core/examples/evolution_examples/4/output_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"custom_run_id", + "component_id":"example_component_1" + }, + "index":{ + "location":"/example_component_1" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/evolution_examples/input_manifest.json b/tests/core/examples/evolution_examples/input_manifest.json new file mode 100644 index 000000000..664367cc2 --- /dev/null +++ b/tests/core/examples/evolution_examples/input_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata":{ + "pipeline_name":"test_pipeline", + "base_path":"gs://bucket", + "run_id":"12345", + "component_id":"example_component" + }, + "index":{ + "location":"/example_component" + }, + "fields": { + "images_width": { + "type": "int32", + "location":"/example_component" + }, + "images_height": { + "type": "int32", + "location":"/example_component" + }, + "images_data": { + "type": "binary", + "location":"/example_component" + }, + "captions_data": { + "type": "binary", + "location":"/example_component" + } + } +} \ No newline at end of file diff --git a/tests/core/examples/manifests/invalid_manifest.json b/tests/core/examples/manifests/invalid_manifest.json new file mode 100644 index 000000000..51ec6c5e5 --- /dev/null +++ b/tests/core/examples/manifests/invalid_manifest.json @@ -0,0 +1,14 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "gs://bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/component1" + }, + "fields": { + "images": {} + } +} \ No newline at end of file diff --git a/tests/core/examples/manifests/valid_manifest.json b/tests/core/examples/manifests/valid_manifest.json new file mode 100644 index 000000000..0f7c58126 --- /dev/null +++ b/tests/core/examples/manifests/valid_manifest.json @@ -0,0 +1,29 @@ +{ + "metadata": { + "pipeline_name": "test_pipeline", + "base_path": "gs://bucket", + "run_id": "test_pipeline_12345", + "component_id": "67890" + }, + "index": { + "location": "/component1" + }, + "fields":{ + "images": { + "location": "/component1", + "type": "binary" + }, + "height": { + "location": "/component2", + "type": "int32" + }, + "width": { + "location": "/component2", + "type": "int32" + }, + "caption": { + "location": "/component3", + "type": "string" + } + } +} \ No newline at end of file diff --git a/tests/test_component_specs.py b/tests/core/test_component_specs.py similarity index 85% rename from tests/test_component_specs.py rename to tests/core/test_component_specs.py index caf0344de..dcbf4c2ed 100644 --- a/tests/test_component_specs.py +++ b/tests/core/test_component_specs.py @@ -8,13 +8,12 @@ import yaml from fondant.core.component_spec import ( ComponentSpec, - ComponentSubset, KubeflowComponentSpec, ) from fondant.core.exceptions import InvalidComponentSpec from fondant.core.schema import Type -component_specs_path = Path(__file__).parent / "example_specs/component_specs" +component_specs_path = Path(__file__).parent / "examples/component_specs" @pytest.fixture() @@ -49,12 +48,19 @@ def test_component_spec_pkgutil_error(mock_get_data): def 
test_component_spec_validation(valid_fondant_schema, invalid_fondant_schema): - """Test that the manifest is validated correctly on instantiation.""" + """Test that the component spec is validated correctly on instantiation.""" ComponentSpec(valid_fondant_schema) with pytest.raises(InvalidComponentSpec): ComponentSpec(invalid_fondant_schema) +def test_component_spec_load_from_file(valid_fondant_schema, invalid_fondant_schema): + """Test that the component spec is validated correctly on instantiation.""" + ComponentSpec.from_file(component_specs_path / "valid_component.yaml") + with pytest.raises(InvalidComponentSpec): + ComponentSpec.from_file(component_specs_path / "invalid_component.yaml") + + def test_attribute_access(valid_fondant_schema): """ Test that attributes can be accessed as expected: @@ -65,8 +71,8 @@ def test_attribute_access(valid_fondant_schema): assert fondant_component.name == "Example component" assert fondant_component.description == "This is an example component" - assert fondant_component.consumes["images"].fields["data"].type == Type("binary") - assert fondant_component.consumes["embeddings"].fields["data"].type == Type.list( + assert fondant_component.consumes["images"].type == Type("binary") + assert fondant_component.consumes["embeddings"].type == Type.list( Type("float32"), ) @@ -129,15 +135,3 @@ def test_kubeflow_component_spec_repr(valid_kubeflow_schema): kubeflow_component_spec = KubeflowComponentSpec(valid_kubeflow_schema) expected_repr = f"KubeflowComponentSpec({valid_kubeflow_schema!r})" assert repr(kubeflow_component_spec) == expected_repr - - -def test_component_subset_repr(): - """Test that the __repr__ method of ComponentSubset returns the expected string.""" - component_subset_schema = { - "name": "Example subset", - "description": "This is an example subset", - } - - component_subset = ComponentSubset(component_subset_schema) - expected_repr = f"ComponentSubset({component_subset_schema!r})" - assert repr(component_subset) == expected_repr diff --git a/tests/test_manifest.py b/tests/core/test_manifest.py similarity index 52% rename from tests/test_manifest.py rename to tests/core/test_manifest.py index 3af3ea425..c24d27c9c 100644 --- a/tests/test_manifest.py +++ b/tests/core/test_manifest.py @@ -3,10 +3,12 @@ from pathlib import Path import pytest +from fondant.core.component_spec import ComponentSpec from fondant.core.exceptions import InvalidManifest -from fondant.core.manifest import Field, Index, Manifest, Subset, Type +from fondant.core.manifest import Field, Manifest, Type -manifest_path = Path(__file__).parent / "example_specs/manifests" +manifest_path = Path(__file__).parent / "examples" / "manifests" +component_specs_path = Path(__file__).parent / "examples" / "component_specs" @pytest.fixture() @@ -28,53 +30,6 @@ def test_manifest_validation(valid_manifest, invalid_manifest): Manifest(invalid_manifest) -def test_subset_init(): - """Test initializing a subset.""" - subset_spec = { - "location": "/images/ABC/123", - "fields": { - "data": { - "type": "binary", - }, - }, - } - subset = Subset(specification=subset_spec, base_path="/tmp") - assert subset.location == "/tmp/images/ABC/123" - assert ( - subset.__repr__() - == "Subset({'location': '/images/ABC/123', 'fields': {'data': {'type': 'binary'}}})" - ) - - -def test_subset_fields(): - """Test manipulating subset fields.""" - subset_spec = { - "location": "/images/ABC/123", - "fields": { - "data": { - "type": "binary", - }, - }, - } - subset = Subset(specification=subset_spec, base_path="/tmp") 
-
-    # add a field
-    subset.add_field(name="data2", type_=Type("binary"))
-    assert "data2" in subset.fields
-
-    # add a duplicate field
-    with pytest.raises(ValueError, match="A field with name data2 already exists"):
-        subset.add_field(name="data2", type_=Type("binary"))
-
-    # add a duplicate field but overwrite
-    subset.add_field(name="data2", type_=Type("string"), overwrite=True)
-    assert subset.fields["data2"].type == Type("string")
-
-    # remove a field
-    subset.remove_field(name="data2")
-    assert "data2" not in subset.fields
-
-
 def test_set_base_path(valid_manifest):
     """Test altering the base path in the manifest."""
     manifest = Manifest(valid_manifest)
@@ -108,9 +63,9 @@ def test_attribute_access(valid_manifest):
     manifest = Manifest(valid_manifest)
 
     assert manifest.metadata == valid_manifest["metadata"]
-    assert manifest.index.location == "gs://bucket/index"
-    assert manifest.subsets["images"].location == "gs://bucket/images"
-    assert manifest.subsets["images"].fields["data"].type == Type("binary")
+    assert manifest.index.location == "/component1"
+    assert manifest.fields["images"].location == "/component1"
+    assert manifest.fields["images"].type == Type("binary")
 
 
 def test_manifest_creation():
@@ -129,8 +84,9 @@ def test_manifest_creation():
         cache_key=cache_key,
     )
 
-    manifest.add_subset("images", [("width", Type("int32")), ("height", Type("int32"))])
-    manifest.subsets["images"].add_field("data", Type("binary"))
+    manifest.add_or_update_field(Field(name="width", type=Type("int32")))
+    manifest.add_or_update_field(Field(name="height", type=Type("int32")))
+    manifest.add_or_update_field(Field(name="data", type=Type("binary")))
 
     assert manifest._specification == {
         "metadata": {
@@ -140,21 +96,19 @@ def test_manifest_creation():
             "component_id": component_id,
             "cache_key": cache_key,
         },
-        "index": {"location": f"/{pipeline_name}/{run_id}/{component_id}/index"},
-        "subsets": {
-            "images": {
-                "location": f"/{pipeline_name}/{run_id}/{component_id}/images",
-                "fields": {
-                    "width": {
-                        "type": "int32",
-                    },
-                    "height": {
-                        "type": "int32",
-                    },
-                    "data": {
-                        "type": "binary",
-                    },
-                },
+        "index": {"location": f"/{component_id}"},
+        "fields": {
+            "width": {
+                "type": "int32",
+                "location": f"/{component_id}",
+            },
+            "height": {
+                "type": "int32",
+                "location": f"/{component_id}",
+            },
+            "data": {
+                "type": "binary",
+                "location": f"/{component_id}",
             },
         },
     }
@@ -172,7 +126,7 @@ def test_manifest_repr():
     assert (
         manifest.__repr__()
        == "Manifest({'metadata': {'base_path': '/', 'pipeline_name': 'NAME', 'run_id': 'A',"
        " 'component_id': '1', 'cache_key': '42'},"
-       " 'index': {'location': '/NAME/A/1/index'}, 'subsets': {}})"
+       " 'index': {'location': '/1'}, 'fields': {}})"
    )
@@ -181,33 +135,30 @@ def test_manifest_alteration(valid_manifest):
     manifest = Manifest(valid_manifest)
 
-    # test adding a subset
-    manifest.add_subset(
-        "images2",
-        [("width", Type("int32")), ("height", Type("int32"))],
-    )
-    assert "images2" in manifest.subsets
+    # test adding a field
+    manifest.add_or_update_field(Field(name="width2", type=Type("int32")))
+    manifest.add_or_update_field(Field(name="height2", type=Type("int32")))
+
+    assert "width2" in manifest.fields
+    assert "height2" in manifest.fields
 
-    # test adding a duplicate subset
-    with pytest.raises(ValueError, match="A subset with name images2 already exists"):
-        manifest.add_subset(
-            "images2",
-            [("width", Type("int32")), ("height", Type("int32"))],
-        )
+    # test adding a duplicate field
+    with pytest.raises(ValueError, match="A field with name width2 already exists"):
+        manifest.add_or_update_field(Field(name="width2", type=Type("int32")))
 
-    # test removing a subset
-    manifest.remove_subset("images2")
-    assert "images2" not in manifest.subsets
+    # test removing a field
+    manifest.remove_field("width2")
+    assert "width2" not in manifest.fields
 
-    # test removing a nonexistent subset
-    with pytest.raises(ValueError, match="Subset pictures not found in specification"):
-        manifest.remove_subset("pictures")
+    # test removing a nonexistent field
+    with pytest.raises(ValueError, match="Field pictures not found in specification"):
+        manifest.remove_field("pictures")
 
 
 def test_manifest_copy_and_adapt(valid_manifest):
     """Test that a manifest can be copied and adapted without changing the original."""
     manifest = Manifest(valid_manifest)
     new_manifest = manifest.copy()
-    new_manifest.remove_subset("images")
+    new_manifest.remove_field("images")
     assert manifest._specification == valid_manifest
     assert new_manifest._specification != valid_manifest
@@ -218,22 +169,59 @@ def test_no_validate_schema(monkeypatch, valid_manifest):
     Manifest(valid_manifest)
 
 
-def test_index_fields():
-    """Test that the fields property of Index returns the expected fields."""
-    subset_spec = {
-        "location": "/images/ABC/123",
-        "fields": {
-            "data": {
-                "type": "binary",
-            },
-        },
-    }
+def test_evolve_manifest():
+    """Test that the fields are evolved as expected."""
+    run_id = "A"
+    spec = ComponentSpec.from_file(component_specs_path / "valid_component.yaml")
+    input_manifest = Manifest.create(
+        pipeline_name="NAME",
+        base_path="/base_path",
+        run_id=run_id,
+        component_id="component_1",
+        cache_key="42",
+    )
 
-    index = Index(specification=subset_spec, base_path="/tmp")
+    output_manifest = input_manifest.evolve(component_spec=spec, run_id=run_id)
 
-    expected_fields = {
-        "id": Field(name="id", type=Type("string")),
-        "source": Field(name="source", type=Type("string")),
-    }
+    assert output_manifest.base_path == input_manifest.base_path
+    assert output_manifest.run_id == run_id
+    assert output_manifest.index.location == "/" + spec.component_folder_name
+    assert output_manifest.fields["captions"].type.name == "string"
+
+
+def test_fields():
+    """Test that fields can be added and updated as expected."""
+    run_id = "A"
+    manifest = Manifest.create(
+        pipeline_name="NAME",
+        base_path="/base_path",
+        run_id=run_id,
+        component_id="component_1",
+        cache_key="42",
+    )
+
+    # add a field
+    manifest.add_or_update_field(Field(name="field_1", type=Type("int32")))
+    assert "field_1" in manifest.fields
+
+    # add a duplicate field, but overwrite (update)
+    manifest.add_or_update_field(
+        Field(name="field_1", type=Type("string")),
+        overwrite=True,
+    )
+    assert manifest.fields["field_1"].type.name == "string"
+
+    # add a duplicate field
+    with pytest.raises(
+        ValueError,
+        match="A field with name field_1 already exists. Set overwrite to true, "
+        "if you want to update the field.",
+    ):
+        manifest.add_or_update_field(
+            Field(name="field_1", type=Type("string")),
+            overwrite=False,
+        )
 
-    assert index.fields == expected_fields
+    # delete a field
+    manifest.remove_field(name="field_1")
+    assert "field_1" not in manifest.fields
diff --git a/tests/test_manifest_evolution.py b/tests/core/test_manifest_evolution.py
similarity index 83%
rename from tests/test_manifest_evolution.py
rename to tests/core/test_manifest_evolution.py
index c79b76aaf..0d9181701 100644
--- a/tests/test_manifest_evolution.py
+++ b/tests/core/test_manifest_evolution.py
@@ -6,7 +6,7 @@
 from fondant.core.component_spec import ComponentSpec
 from fondant.core.manifest import Manifest
 
-examples_path = Path(__file__).parent / "example_specs/evolution_examples"
+examples_path = Path(__file__).parent / "examples/evolution_examples"
 
 
 @pytest.fixture()
@@ -41,7 +41,7 @@ def test_component_spec_location_update():
     with open(examples_path / "input_manifest.json") as f:
         input_manifest = json.load(f)
 
-    with open(examples_path / "7/component.yaml") as f:
+    with open(examples_path / "4/component.yaml") as f:
         specification = yaml.safe_load(f)
 
     manifest = Manifest(input_manifest)
@@ -50,7 +50,4 @@
         component_spec=component_spec,
     )
 
-    assert (
-        evolved_manifest._specification["subsets"]["images"]["location"]
-        == "/test_pipeline/12345/example_component/images"
-    )
+    assert evolved_manifest.index.location == "/" + component_spec.component_folder_name
diff --git a/tests/test_schema.py b/tests/core/test_schema.py
similarity index 100%
rename from tests/test_schema.py
rename to tests/core/test_schema.py
diff --git a/tests/example_data/components/1.yaml b/tests/example_data/components/1.yaml
deleted file mode 100644
index 0c245a512..000000000
--- a/tests/example_data/components/1.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-name: Test component 1
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  properties:
-    fields:
-      Name:
-        type: "string"
-      HP:
-        type: "int32"
-  types:
-    fields:
-      Type 1:
-        type: "string"
-      Type 2:
-        type: "string"
-
-produces:
-  properties:
-    fields:
-      Name:
-        type: "string"
-      HP:
-        type: "int32"
-  types:
-    fields:
-      Type 1:
-        type: "string"
-      Type 2:
-        type: "string"
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
\ No newline at end of file
diff --git a/tests/example_data/manifest.json b/tests/example_data/manifest.json
deleted file mode 100644
index 8fe4ef16b..000000000
--- a/tests/example_data/manifest.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
-  "metadata": {
-    "pipeline_name": "test_pipeline",
-    "base_path": "tests/example_data/subsets_input",
-    "run_id": "test_pipeline_12345",
-    "component_id": "67890"
-  },
-  "index": {
-    "location": "/index"
-  },
-  "subsets": {
-    "properties": {
-      "location": "/properties",
-      "fields": {
-        "Name": {
-          "type": "string"
-        },
-        "HP": {
-          "type": "int32"
-        }
-      }
-    },
-    "types": {
-      "location": "/types",
-      "fields": {
-        "Type 1": {
-          "type": "string"
-        },
-        "Type 2": {
-          "type": "string"
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_data/raw/split.py b/tests/example_data/raw/split.py
deleted file mode 100644
index 6800ee323..000000000
--- a/tests/example_data/raw/split.py
+++ /dev/null
@@ -1,39 +0,0 @@
-"""
-This is a small script to split the raw data into different subsets to be used while testing.
-
-The data is the 151 first pokemon and the following fields are available:
-
-'id', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
-'Sp. Atk', 'Sp. Def', 'Speed', 'source', 'Legendary'
-
-
-"""
-from pathlib import Path
-
-import dask.dataframe as dd
-
-data_path = Path(__file__).parent
-output_path = Path(__file__).parent.parent / "subsets_input/"
-
-
-def split_into_subsets():
-    # read in complete dataset
-    master_df = dd.read_parquet(path=data_path / "testset.parquet")
-    master_df = master_df.set_index("id", sorted=True)
-    master_df = master_df.repartition(divisions=[0, 50, 100, 151], force=True)
-
-    # create index subset
-    index_df = master_df.index.to_frame().drop(columns=["id"])
-    index_df.to_parquet(output_path / "index")
-
-    # create properties subset
-    properties_df = master_df[["Name", "HP"]]
-    properties_df.to_parquet(output_path / "properties")
-
-    # create types subset
-    types_df = master_df[["Type 1", "Type 2"]]
-    types_df.to_parquet(output_path / "types")
-
-
-if __name__ == "__main__":
-    split_into_subsets()
diff --git a/tests/example_data/raw/testset.parquet b/tests/example_data/raw/testset.parquet
deleted file mode 100644
index e7b9c625f..000000000
Binary files a/tests/example_data/raw/testset.parquet and /dev/null differ
diff --git a/tests/example_data/subsets_input/index/part.0.parquet b/tests/example_data/subsets_input/index/part.0.parquet
deleted file mode 100644
index 5f0f7a24d..000000000
Binary files a/tests/example_data/subsets_input/index/part.0.parquet and /dev/null differ
diff --git a/tests/example_data/subsets_input/index/part.1.parquet b/tests/example_data/subsets_input/index/part.1.parquet
deleted file mode 100644
index be1028aaa..000000000
Binary files a/tests/example_data/subsets_input/index/part.1.parquet and /dev/null differ
diff --git a/tests/example_data/subsets_input/index/part.2.parquet b/tests/example_data/subsets_input/index/part.2.parquet
deleted file mode 100644
index c745a39dc..000000000
Binary files a/tests/example_data/subsets_input/index/part.2.parquet and /dev/null differ
diff --git a/tests/example_data/subsets_input/types/part.1.parquet b/tests/example_data/subsets_input/types/part.1.parquet
deleted file mode 100644
index 9d20d681f..000000000
Binary files a/tests/example_data/subsets_input/types/part.1.parquet and /dev/null differ
diff --git a/tests/example_data/subsets_input/types/part.2.parquet b/tests/example_data/subsets_input/types/part.2.parquet
deleted file mode 100644
index b89ce72bd..000000000
Binary files a/tests/example_data/subsets_input/types/part.2.parquet and /dev/null differ
diff --git a/tests/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml b/tests/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml
deleted file mode 100644
index 389da55a1..000000000
--- a/tests/example_pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-name: Second component
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  images:
-    fields:
-      data:
-        type: binary
-
-  caption:
-    fields:
-      data:
-        type: string
-
-produces:
-  embeddings:
-    fields:
-      data:
-        type: array
-        items:
-          type: float32
-
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
\ No newline at end of file
diff --git a/tests/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml b/tests/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml
deleted file mode 100644
index 3c996e9d6..000000000
--- a/tests/example_pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: Second component
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  images:
-    fields:
-      data:
-        type: binary
-
-  captions:
-    fields:
-      data:
-        type: string
-      description:
-        type: binary
-
-produces:
-  embeddings:
-    fields:
-      data:
-        type: array
-        items:
-          type: float32
-
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
\ No newline at end of file
diff --git a/tests/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml b/tests/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml
deleted file mode 100644
index 45964a8c6..000000000
--- a/tests/example_pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-name: First component
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  images:
-    fields:
-      data:
-        type: binary
-
-produces:
-  captions:
-    fields:
-      data:
-        type: string
-
-  images:
-    fields:
-      data:
-        type: binary
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
\ No newline at end of file
diff --git a/tests/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml b/tests/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml
deleted file mode 100644
index c02abbaa1..000000000
--- a/tests/example_pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-name: Second component
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  images:
-    fields:
-      data:
-        type: string
-
-  captions:
-    fields:
-      data:
-        type: string
-
-produces:
-  embeddings:
-    fields:
-      data:
-        type: array
-        items:
-          type: float32
-
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
\ No newline at end of file
diff --git a/tests/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml b/tests/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml
deleted file mode 100644
index 3cda0cc6c..000000000
--- a/tests/example_pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-name: Fourth component
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  images:
-    fields:
-      data:
-        type: binary
-
-  captions:
-    fields:
-      data:
-        type: string
-
-  embeddings:
-    fields:
-      data:
-        type: array
-        items:
-          type: float32
-
-produces:
-  images:
-    fields:
-      data:
-        type: binary
-  additionalSubsets: false
-
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
-  some_list:
-    description: Some list
-    type: list
-    items:
-      type: int
\ No newline at end of file
diff --git a/tests/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml b/tests/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml
deleted file mode 100644
index 091a7d9d5..000000000
--- a/tests/example_pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: Third component
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  images:
-    fields:
-      data:
-        type: binary
-
-  captions:
-    fields:
-      data:
-        type: string
-
-  embeddings:
-    fields:
-      data:
-        type: array
-        items:
-          type: float32
-
-produces:
-  images:
-    fields:
-      data:
-        type: binary
-  additionalSubsets: false
-
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
diff --git a/tests/example_specs/component_specs/valid_component.yaml b/tests/example_specs/component_specs/valid_component.yaml
deleted file mode 100644
index c4b99e837..000000000
--- a/tests/example_specs/component_specs/valid_component.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: Example component
-description: This is an example component
-image: example_component:latest
-tags:
-  - Data loading
-
-consumes:
-  images:
-    fields:
-      data:
-        type: binary
-
-  embeddings:
-    fields:
-      data:
-        type: array
-        items:
-          type: float32
-
-produces:
-  captions:
-    fields:
-      data:
-        type: string
-
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
\ No newline at end of file
diff --git a/tests/example_specs/components/input_manifest.json b/tests/example_specs/components/input_manifest.json
deleted file mode 100644
index 7af13d599..000000000
--- a/tests/example_specs/components/input_manifest.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-  "metadata": {
-    "pipeline_name": "test_pipeline",
-    "base_path": "/bucket",
-    "run_id": "test_pipeline_12345",
-    "component_id": "67890"
-  },
-  "index": {
-    "location": "/index/12345/example_component"
-  },
-  "subsets": {
-    "images": {
-      "location": "/images",
-      "fields": {
-        "data": {
-          "type": "binary"
-        }
-      }
-    }
-
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/evolution_examples/1/output_manifest.json b/tests/example_specs/evolution_examples/1/output_manifest.json
deleted file mode 100644
index 17b94c0b0..000000000
--- a/tests/example_specs/evolution_examples/1/output_manifest.json
+++ /dev/null
@@ -1,46 +0,0 @@
-{
-  "metadata":{
-    "pipeline_name":"test_pipeline",
-    "base_path":"gs://bucket",
-    "run_id":"custom_run_id",
-    "component_id":"example_component"
-  },
-  "index":{
-    "location":"/test_pipeline/custom_run_id/example_component/index"
-  },
-  "subsets":{
-    "images":{
-      "location":"/test_pipeline/12345/example_component/images",
-      "fields":{
-        "width":{
-          "type":"int32"
-        },
-        "height":{
-          "type":"int32"
-        },
-        "data":{
-          "type":"binary"
-        }
-      }
-    },
-    "captions":{
-      "location":"/test_pipeline/12345/example_component/captions",
-      "fields":{
-        "data":{
-          "type":"binary"
-        }
-      }
-    },
-    "embeddings":{
-      "location":"/test_pipeline/custom_run_id/example_component/embeddings",
-      "fields":{
-        "data":{
-          "type":"array",
-          "items":{
-            "type":"float32"
-          }
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/evolution_examples/2/component.yaml b/tests/example_specs/evolution_examples/2/component.yaml
deleted file mode 100644
index f37ff99d1..000000000
--- a/tests/example_specs/evolution_examples/2/component.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: Example component
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  images:
-    fields:
-      data:
-        type: binary
-  additionalSubsets: false
-
-produces:
-  embeddings:
-    fields:
-      data:
-        type: array
-        items:
-          type: float32
-
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
diff --git a/tests/example_specs/evolution_examples/2/output_manifest.json b/tests/example_specs/evolution_examples/2/output_manifest.json
deleted file mode 100644
index 3a40b1c9d..000000000
--- a/tests/example_specs/evolution_examples/2/output_manifest.json
+++ /dev/null
@@ -1,38 +0,0 @@
-{
-  "metadata":{
-    "pipeline_name":"test_pipeline",
-    "base_path":"gs://bucket",
-    "run_id":"custom_run_id",
-    "component_id":"example_component"
-  },
-  "index":{
-    "location":"/test_pipeline/custom_run_id/example_component/index"
-  },
-  "subsets":{
-    "images":{
-      "location":"/test_pipeline/12345/example_component/images",
-      "fields":{
-        "width":{
-          "type":"int32"
-        },
-        "height":{
-          "type":"int32"
-        },
-        "data":{
-          "type":"binary"
-        }
-      }
-    },
-    "embeddings":{
-      "location":"/test_pipeline/custom_run_id/example_component/embeddings",
-      "fields":{
-        "data":{
-          "type":"array",
-          "items":{
-            "type":"float32"
-          }
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/evolution_examples/3/component.yaml b/tests/example_specs/evolution_examples/3/component.yaml
deleted file mode 100644
index 6753a083b..000000000
--- a/tests/example_specs/evolution_examples/3/component.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-name: Example component
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  images:
-    fields:
-      data:
-        type: binary
-    additionalFields: false
-  additionalSubsets: false
-
-produces:
-  embeddings:
-    fields:
-      data:
-        type: array
-        items:
-          type: float32
-
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
diff --git a/tests/example_specs/evolution_examples/3/output_manifest.json b/tests/example_specs/evolution_examples/3/output_manifest.json
deleted file mode 100644
index a9abda6d0..000000000
--- a/tests/example_specs/evolution_examples/3/output_manifest.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "metadata":{
-    "pipeline_name":"test_pipeline",
-    "base_path":"gs://bucket",
-    "run_id":"custom_run_id",
-    "component_id":"example_component"
-  },
-  "index":{
-    "location":"/test_pipeline/custom_run_id/example_component/index"
-  },
-  "subsets":{
-    "images":{
-      "location":"/test_pipeline/12345/example_component/images",
-      "fields":{
-        "data":{
-          "type":"binary"
-        }
-      }
-    },
-    "embeddings":{
-      "location":"/test_pipeline/custom_run_id/example_component/embeddings",
-      "fields":{
-        "data":{
-          "type":"array",
-          "items":{
-            "type":"float32"
-          }
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/evolution_examples/4/output_manifest.json b/tests/example_specs/evolution_examples/4/output_manifest.json
deleted file mode 100644
index 24af4f2ac..000000000
--- a/tests/example_specs/evolution_examples/4/output_manifest.json
+++ /dev/null
@@ -1,38 +0,0 @@
-{
-  "metadata":{
-    "pipeline_name":"test_pipeline",
-    "base_path":"gs://bucket",
-    "run_id":"custom_run_id",
-    "component_id":"example_component"
-  },
-  "index":{
-    "location":"/test_pipeline/custom_run_id/example_component/index"
-  },
-  "subsets":{
-    "images":{
-      "location":"/test_pipeline/custom_run_id/example_component/images",
-      "fields":{
-        "width":{
-          "type":"int32"
-        },
-        "height":{
-          "type":"int32"
-        },
-        "data":{
-          "type":"binary"
-        },
-        "encoding":{
-          "type":"string"
-        }
-      }
-    },
-    "captions":{
-      "location":"/test_pipeline/12345/example_component/captions",
-      "fields":{
-        "data":{
-          "type":"binary"
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/evolution_examples/5/component.yaml b/tests/example_specs/evolution_examples/5/component.yaml
deleted file mode 100644
index 93aaf68b3..000000000
--- a/tests/example_specs/evolution_examples/5/component.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-name: Example component
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  images:
-    fields:
-      data:
-        type: binary
-
-produces:
-  images:
-    fields:
-      encoding:
-        type: string
-    additionalFields: false
-
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
diff --git a/tests/example_specs/evolution_examples/5/output_manifest.json b/tests/example_specs/evolution_examples/5/output_manifest.json
deleted file mode 100644
index 8bcf6141d..000000000
--- a/tests/example_specs/evolution_examples/5/output_manifest.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
-  "metadata":{
-    "pipeline_name":"test_pipeline",
-    "base_path":"gs://bucket",
-    "run_id":"custom_run_id",
-    "component_id":"example_component"
-  },
-  "index":{
-    "location":"/test_pipeline/custom_run_id/example_component/index"
-  },
-  "subsets":{
-    "images":{
-      "location":"/test_pipeline/custom_run_id/example_component/images",
-      "fields":{
-        "encoding":{
-          "type":"string"
-        }
-      }
-    },
-    "captions":{
-      "location":"/test_pipeline/12345/example_component/captions",
-      "fields":{
-        "data":{
-          "type":"binary"
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/evolution_examples/6/component.yaml b/tests/example_specs/evolution_examples/6/component.yaml
deleted file mode 100644
index 065061791..000000000
--- a/tests/example_specs/evolution_examples/6/component.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: Example component
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  images:
-    fields:
-      data:
-        type: binary
-
-produces:
-  images:
-    fields:
-      encoding:
-        type: string
-    additionalFields: false
-  additionalSubsets: false
-
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
diff --git a/tests/example_specs/evolution_examples/6/output_manifest.json b/tests/example_specs/evolution_examples/6/output_manifest.json
deleted file mode 100644
index b7521bf66..000000000
--- a/tests/example_specs/evolution_examples/6/output_manifest.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-  "metadata":{
-    "pipeline_name":"test_pipeline",
-    "base_path":"gs://bucket",
-    "run_id":"custom_run_id",
-    "component_id":"example_component"
-  },
-  "index":{
-    "location":"/test_pipeline/custom_run_id/example_component/index"
-  },
-  "subsets":{
-    "images":{
-      "location":"/test_pipeline/custom_run_id/example_component/images",
-      "fields":{
-        "encoding":{
-          "type":"string"
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/evolution_examples/7/component.yaml b/tests/example_specs/evolution_examples/7/component.yaml
deleted file mode 100644
index 5746ffa4d..000000000
--- a/tests/example_specs/evolution_examples/7/component.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: Example component
-description: This is an example component
-image: example_component:latest
-
-consumes:
-  images:
-    fields:
-      data:
-        type: binary
-
-produces:
-  images:
-    fields:
-      data:
-        type: string
-    additionalFields: false
-  additionalSubsets: false
-
-args:
-  storage_args:
-    description: Storage arguments
-    type: str
diff --git a/tests/example_specs/evolution_examples/7/output_manifest.json b/tests/example_specs/evolution_examples/7/output_manifest.json
deleted file mode 100644
index a9eb8a308..000000000
--- a/tests/example_specs/evolution_examples/7/output_manifest.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-  "metadata":{
-    "pipeline_name":"test_pipeline",
-    "base_path":"gs://bucket",
-    "run_id":"custom_run_id",
-    "component_id":"example_component"
-  },
-  "index":{
-    "location":"/test_pipeline/custom_run_id/example_component/index"
-  },
-  "subsets":{
-    "images":{
-      "location":"/test_pipeline/custom_run_id/example_component/images",
-      "fields":{
-        "data":{
-          "type":"string"
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/evolution_examples/8/output_manifest.json b/tests/example_specs/evolution_examples/8/output_manifest.json
deleted file mode 100644
index de2621c49..000000000
--- a/tests/example_specs/evolution_examples/8/output_manifest.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
-  "metadata": {
-    "pipeline_name": "test_pipeline",
-    "base_path": "gs://bucket",
-    "run_id": "custom_run_id",
-    "component_id": "example_component"
-  },
-  "index": {
-    "location": "/test_pipeline/custom_run_id/example_component/index"
-  },
-  "subsets": {
-    "images": {
-      "location": "/test_pipeline/12345/example_component/images",
-      "fields": {
-        "width": {
-          "type": "int32"
-        },
-        "height": {
-          "type": "int32"
-        },
-        "data": {
-          "type": "binary"
-        }
-      }
-    },
-    "captions": {
-      "location": "/test_pipeline/12345/example_component/captions",
-      "fields": {
-        "data": {
-          "type": "binary"
-        }
-      }
-    }
-  }
-}
diff --git a/tests/example_specs/evolution_examples/input_manifest.json b/tests/example_specs/evolution_examples/input_manifest.json
deleted file mode 100644
index 2ecf37243..000000000
--- a/tests/example_specs/evolution_examples/input_manifest.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
-  "metadata":{
-    "pipeline_name":"test_pipeline",
-    "base_path":"gs://bucket",
-    "run_id":"12345",
-    "component_id":"example_component"
-  },
-  "index":{
-    "location":"/test_pipeline/12345/example_component/index"
-  },
-  "subsets":{
-    "images":{
-      "location":"/test_pipeline/12345/example_component/images",
-      "fields":{
-        "width":{
-          "type":"int32"
-        },
-        "height":{
-          "type":"int32"
-        },
-        "data":{
-          "type":"binary"
-        }
-      }
-    },
-    "captions":{
-      "location":"/test_pipeline/12345/example_component/captions",
-      "fields":{
-        "data":{
-          "type":"binary"
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/manifests/invalid_manifest.json b/tests/example_specs/manifests/invalid_manifest.json
deleted file mode 100644
index 3fe8b1097..000000000
--- a/tests/example_specs/manifests/invalid_manifest.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{
-  "metadata": {
-    "base_path": "gs://bucket"
-  },
-  "index": {
-    "location": "/index"
-  },
-  "subsets": {
-    "images": {
-      "location": "/images",
-      "fields": []
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/manifests/valid_manifest.json b/tests/example_specs/manifests/valid_manifest.json
deleted file mode 100644
index 9bc00c512..000000000
--- a/tests/example_specs/manifests/valid_manifest.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
-  "metadata": {
-    "pipeline_name": "test_pipeline",
-    "base_path": "gs://bucket",
-    "run_id": "test_pipeline_12345",
-    "component_id": "67890"
-  },
-  "index": {
-    "location": "/index"
-  },
-  "subsets": {
-    "images": {
-      "location": "/images",
-      "fields": {
-        "data": {
-          "type": "binary"
-        },
-        "height": {
-          "type": "int32"
-        },
-        "width": {
-          "type": "int32"
-        }
-      }
-    },
-    "captions": {
-      "location": "/captions",
-      "fields": {
-        "data": {
-          "type": "binary"
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/mock_base_path/example_pipeline/cache/42.txt b/tests/example_specs/mock_base_path/example_pipeline/cache/42.txt
deleted file mode 100644
index 768ddfb21..000000000
--- a/tests/example_specs/mock_base_path/example_pipeline/cache/42.txt
+++ /dev/null
@@ -1 +0,0 @@
-tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json
\ No newline at end of file
diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json b/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json
deleted file mode 100644
index 541775f84..000000000
--- a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_1/manifest.json
+++ /dev/null
@@ -1,36 +0,0 @@
-{
-  "metadata": {
-    "pipeline_name": "example_pipeline",
-    "base_path": "tests/example_data/subsets_input/mock_base_path",
-    "run_id": "example_pipeline_2023",
-    "component_id": "component_1",
-    "cache_key": "42"
-  },
-  "index": {
-    "location": "/index"
-  },
-  "subsets": {
-    "images": {
-      "location": "/images",
-      "fields": {
-        "data": {
-          "type": "binary"
-        },
-        "height": {
-          "type": "int32"
-        },
-        "width": {
-          "type": "int32"
-        }
-      }
-    },
-    "captions": {
-      "location": "/captions",
-      "fields": {
-        "data": {
-          "type": "binary"
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json b/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json
deleted file mode 100644
index 78cfec59a..000000000
--- a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2023/component_2/manifest.json
+++ /dev/null
@@ -1,36 +0,0 @@
-{
-  "metadata": {
-    "pipeline_name": "example_pipeline",
-    "base_path": "tests/example_data/subsets_input/mock_base_path",
-    "run_id": "example_pipeline_2023",
-    "component_id": "component_2",
-    "cache_key": "42"
-  },
-  "index": {
-    "location": "/index"
-  },
-  "subsets": {
-    "images": {
-      "location": "/images",
-      "fields": {
-        "data": {
-          "type": "binary"
-        },
-        "height": {
-          "type": "int32"
-        },
-        "width": {
-          "type": "int32"
-        }
-      }
-    },
-    "captions": {
-      "location": "/captions",
-      "fields": {
-        "data": {
-          "type": "binary"
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json b/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json
deleted file mode 100644
index f00c64aac..000000000
--- a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_1/manifest.json
+++ /dev/null
@@ -1,36 +0,0 @@
-{
-  "metadata": {
-    "pipeline_name": "example_pipeline",
-    "base_path": "tests/example_data/subsets_input/mock_base_path",
-    "run_id": "example_pipeline_2024",
-    "component_id": "component_1",
-    "cache_key": "42"
-  },
-  "index": {
-    "location": "/index"
-  },
-  "subsets": {
-    "images": {
-      "location": "/images",
-      "fields": {
-        "data": {
-          "type": "binary"
-        },
-        "height": {
-          "type": "int32"
-        },
-        "width": {
-          "type": "int32"
-        }
-      }
-    },
-    "captions": {
-      "location": "/captions",
-      "fields": {
-        "data": {
-          "type": "binary"
-        }
-      }
-    }
-  }
-}
\ No newline at end of file
diff --git a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json b/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json
deleted file mode 100644
index f7a6f429d..000000000
--- a/tests/example_specs/mock_base_path/example_pipeline/example_pipeline_2024/component_2/manifest.json
+++ /dev/null
@@ -1,36 +0,0 @@
-{
-  "metadata": {
-    "pipeline_name": "example_pipeline",
- "base_path": "tests/example_data/subsets_input/mock_base_path", - "run_id": "example_pipeline_2024", - "component_id": "component_2", - "cache_key": "42" - }, - "index": { - "location": "/index" - }, - "subsets": { - "images": { - "location": "/images", - "fields": { - "data": { - "type": "binary" - }, - "height": { - "type": "int32" - }, - "width": { - "type": "int32" - } - } - }, - "captions": { - "location": "/captions", - "fields": { - "data": { - "type": "binary" - } - } - } - } -} \ No newline at end of file diff --git a/tests/example_component/Dockerfile b/tests/examples/example_component/Dockerfile similarity index 100% rename from tests/example_component/Dockerfile rename to tests/examples/example_component/Dockerfile diff --git a/tests/example_component/fondant_component.yaml b/tests/examples/example_component/fondant_component.yaml similarity index 100% rename from tests/example_component/fondant_component.yaml rename to tests/examples/example_component/fondant_component.yaml diff --git a/tests/example_modules/component.py b/tests/examples/example_modules/component.py similarity index 100% rename from tests/example_modules/component.py rename to tests/examples/example_modules/component.py diff --git a/tests/example_modules/invalid_component.py b/tests/examples/example_modules/invalid_component.py similarity index 100% rename from tests/example_modules/invalid_component.py rename to tests/examples/example_modules/invalid_component.py diff --git a/tests/example_modules/invalid_double_components.py b/tests/examples/example_modules/invalid_double_components.py similarity index 100% rename from tests/example_modules/invalid_double_components.py rename to tests/examples/example_modules/invalid_double_components.py diff --git a/tests/example_modules/invalid_double_pipeline.py b/tests/examples/example_modules/invalid_double_pipeline.py similarity index 100% rename from tests/example_modules/invalid_double_pipeline.py rename to tests/examples/example_modules/invalid_double_pipeline.py diff --git a/tests/example_modules/pipeline.py b/tests/examples/example_modules/pipeline.py similarity index 100% rename from tests/example_modules/pipeline.py rename to tests/examples/example_modules/pipeline.py diff --git a/tests/sample_pipeline_test/components/dummy_component/Dockerfile b/tests/integration_tests/sample_pipeline_test/components/dummy_component/Dockerfile similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/Dockerfile rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/Dockerfile diff --git a/tests/sample_pipeline_test/components/dummy_component/README.md b/tests/integration_tests/sample_pipeline_test/components/dummy_component/README.md similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/README.md rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/README.md diff --git a/tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml b/tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml similarity index 73% rename from tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml index 1091703eb..0a041fa3d 100644 --- a/tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml +++ 
b/tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml @@ -4,13 +4,9 @@ description: Dummy component for testing custom components image: fndnt/dummy_component:dev consumes: - text: - fields: - data: + text_data: type: string produces: - text: - fields: - data: + text_data: type: string \ No newline at end of file diff --git a/tests/sample_pipeline_test/components/dummy_component/requirements.txt b/tests/integration_tests/sample_pipeline_test/components/dummy_component/requirements.txt similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/requirements.txt rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/requirements.txt diff --git a/tests/sample_pipeline_test/components/dummy_component/src/main.py b/tests/integration_tests/sample_pipeline_test/components/dummy_component/src/main.py similarity index 100% rename from tests/sample_pipeline_test/components/dummy_component/src/main.py rename to tests/integration_tests/sample_pipeline_test/components/dummy_component/src/main.py diff --git a/tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml b/tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml similarity index 95% rename from tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml rename to tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml index 35c43aadb..eddb6e580 100644 --- a/tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml +++ b/tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml @@ -3,9 +3,7 @@ description: Component that loads a dataset from a parquet uri image: fndnt/load_from_parquet:dev produces: - text: - fields: - data: + text_data: type: string args: diff --git a/tests/sample_pipeline_test/data/sample.parquet b/tests/integration_tests/sample_pipeline_test/data/sample.parquet similarity index 100% rename from tests/sample_pipeline_test/data/sample.parquet rename to tests/integration_tests/sample_pipeline_test/data/sample.parquet diff --git a/tests/test_sample_pipeline.py b/tests/integration_tests/test_sample_pipeline.py similarity index 91% rename from tests/test_sample_pipeline.py rename to tests/integration_tests/test_sample_pipeline.py index fefc65531..8e7f6fbda 100644 --- a/tests/test_sample_pipeline.py +++ b/tests/integration_tests/test_sample_pipeline.py @@ -17,7 +17,7 @@ # work around to make test executable on M1 Macbooks os.environ["DOCKER_DEFAULT_PLATFORM"] = "linux/amd64" -BASE_PATH = Path("./tests/sample_pipeline_test") +BASE_PATH = Path("./tests/integration_tests/sample_pipeline_test") NUMBER_OF_COMPONENTS = 3 @@ -57,6 +57,7 @@ def sample_pipeline(data_dir="./data") -> Pipeline: return pipeline +@pytest.mark.skip(reason="Skipping due to random failure.") def test_local_runner(sample_pipeline, tmp_path_factory): with tmp_path_factory.mktemp("temp") as data_dir: sample_pipeline.base_path = str(data_dir) @@ -64,7 +65,8 @@ def test_local_runner(sample_pipeline, tmp_path_factory): sample_pipeline, output_path="docker-compose.yaml", extra_volumes=[ - str(Path("tests/sample_pipeline_test/data").resolve()) + ":/data", + str(Path("tests/integration_tests/sample_pipeline_test/data").resolve()) + + ":/data", ], ) DockerRunner().run("docker-compose.yaml") diff --git a/tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml 
b/tests/pipeline/examples/pipelines/compiled_pipeline/kubeflow_pipeline.yml similarity index 100% rename from tests/example_pipelines/compiled_pipeline/kubeflow_pipeline.yml rename to tests/pipeline/examples/pipelines/compiled_pipeline/kubeflow_pipeline.yml diff --git a/tests/example_specs/component_specs/invalid_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml similarity index 63% rename from tests/example_specs/component_specs/invalid_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml index 3fc8128b5..066519825 100644 --- a/tests/example_specs/component_specs/invalid_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/first_component/fondant_component.yaml @@ -1,16 +1,16 @@ -name: Example component +name: First component description: This is an example component image: example_component:latest consumes: - images: - data: binary + images_data: + type: binary produces: - captions: - data: string + captions_data: + type: string -Arguments: +args: storage_args: description: Storage arguments type: str \ No newline at end of file diff --git a/tests/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml similarity index 55% rename from tests/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml index 2f9907df1..e9b67d68e 100644 --- a/tests/example_pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_1/second_component/fondant_component.yaml @@ -3,18 +3,17 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary + + caption_data: + type: string produces: - embeddings: - fields: - data: - type: array - items: - type: float32 + embeddings_data: + type: array + items: + type: float32 args: storage_args: diff --git a/tests/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml similarity index 61% rename from tests/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml index 18ea49b2c..053b4c5b5 100644 --- a/tests/example_pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml @@ -2,17 +2,16 @@ name: First component description: This is an example component image: example_component:latest -produces: - images: - fields: - data: - type: binary +consumes: + images_data: + type: binary - captions: - fields: - data: - type: string +produces: + captions_data: + type: string + images_data: + type: binary args: storage_args: description: Storage arguments diff --git a/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml new file mode 100644 index 
000000000..a1a7995a2 --- /dev/null +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_2/second_component/fondant_component.yaml @@ -0,0 +1,24 @@ +name: Second component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + + captions_data: + type: string + + captions_description: + type: binary + +produces: + embeddings_data: + type: array + items: + type: float32 + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml similarity index 53% rename from tests/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml rename to tests/pipeline/examples/pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml index 45964a8c6..053b4c5b5 100644 --- a/tests/example_pipelines/invalid_pipeline/example_2/first_component/fondant_component.yaml +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/first_component/fondant_component.yaml @@ -3,21 +3,15 @@ description: This is an example component image: example_component:latest consumes: - images: - fields: - data: - type: binary + images_data: + type: binary produces: - captions: - fields: - data: - type: string + captions_data: + type: string - images: - fields: - data: - type: binary + images_data: + type: binary args: storage_args: description: Storage arguments diff --git a/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml new file mode 100644 index 000000000..8e0517f0a --- /dev/null +++ b/tests/pipeline/examples/pipelines/invalid_pipeline/example_3/second_component/fondant_component.yaml @@ -0,0 +1,21 @@ +name: Second component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: string + + captions_data: + type: string + +produces: + embeddings_data: + type: array + items: + type: float32 + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/first_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/Dockerfile diff --git a/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml new file mode 100644 index 000000000..0841688e9 --- /dev/null +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/first_component/fondant_component.yaml @@ -0,0 +1,15 @@ +name: First component +description: This is an example component +image: example_component:latest + +produces: + images_data: + type: binary + + captions_data: + type: string + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile 
b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/fourth_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/Dockerfile diff --git a/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml new file mode 100644 index 000000000..1cef340bd --- /dev/null +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/fourth_component/fondant_component.yaml @@ -0,0 +1,29 @@ +name: Fourth component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + + captions_data: + type: string + + embeddings_data: + type: array + items: + type: float32 + +produces: + images_data: + type: binary + +args: + storage_args: + description: Storage arguments + type: str + some_list: + description: Some list + type: list + items: + type: int \ No newline at end of file diff --git a/tests/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/second_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/Dockerfile diff --git a/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml new file mode 100644 index 000000000..fa328ae01 --- /dev/null +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/second_component/fondant_component.yaml @@ -0,0 +1,18 @@ +name: Second component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + +produces: + embeddings_data: + type: array + items: + type: float32 + +args: + storage_args: + description: Storage arguments + type: str \ No newline at end of file diff --git a/tests/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/Dockerfile similarity index 100% rename from tests/example_pipelines/valid_pipeline/example_1/third_component/Dockerfile rename to tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/Dockerfile diff --git a/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml new file mode 100644 index 000000000..fb6ebbaa0 --- /dev/null +++ b/tests/pipeline/examples/pipelines/valid_pipeline/example_1/third_component/fondant_component.yaml @@ -0,0 +1,24 @@ +name: Third component +description: This is an example component +image: example_component:latest + +consumes: + images_data: + type: binary + + captions_data: + type: string + + embeddings_data: + type: array + items: + type: float32 + +produces: + images_data: + type: binary + +args: + storage_args: + description: Storage arguments + type: str diff --git a/tests/test_compiler.py b/tests/pipeline/test_compiler.py similarity index 99% rename from tests/test_compiler.py rename to tests/pipeline/test_compiler.py index 
903c7963c..2c34f7f4e 100644 --- a/tests/test_compiler.py +++ b/tests/pipeline/test_compiler.py @@ -20,9 +20,9 @@ VertexPipelineConfigs, ) -COMPONENTS_PATH = Path("./tests/example_pipelines/valid_pipeline") +COMPONENTS_PATH = Path("./tests/pipeline/examples/pipelines/valid_pipeline") -VALID_PIPELINE = Path("./tests/example_pipelines/compiled_pipeline/") +VALID_PIPELINE = Path("./tests/pipeline/examples/pipelines/compiled_pipeline/") TEST_PIPELINES = [ ( diff --git a/tests/test_pipeline.py b/tests/pipeline/test_pipeline.py similarity index 98% rename from tests/test_pipeline.py rename to tests/pipeline/test_pipeline.py index 37d421ef6..b4deebc97 100644 --- a/tests/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -8,8 +8,8 @@ from fondant.core.exceptions import InvalidPipelineDefinition from fondant.pipeline import ComponentOp, Pipeline, Resources -valid_pipeline_path = Path(__file__).parent / "example_pipelines/valid_pipeline" -invalid_pipeline_path = Path(__file__).parent / "example_pipelines/invalid_pipeline" +valid_pipeline_path = Path(__file__).parent / "examples/pipelines/valid_pipeline" +invalid_pipeline_path = Path(__file__).parent / "examples/pipelines/invalid_pipeline" def yaml_file_to_dict(file_path): diff --git a/tests/test_runner.py b/tests/pipeline/test_runner.py similarity index 98% rename from tests/test_runner.py rename to tests/pipeline/test_runner.py index 84ad63304..011f65e55 100644 --- a/tests/test_runner.py +++ b/tests/pipeline/test_runner.py @@ -11,7 +11,7 @@ VertexRunner, ) -VALID_PIPELINE = Path("./tests/example_pipelines/compiled_pipeline/") +VALID_PIPELINE = Path("./tests/pipeline/examples/pipelines/compiled_pipeline/") def test_docker_runner(): diff --git a/tests/test_cli.py b/tests/test_cli.py index 7897719aa..61fa8630f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -55,16 +55,16 @@ def test_basic_invocation(command): @pytest.mark.parametrize( "module_str", [ - "example_modules.component", - "example_modules/component", - "example_modules.component.py", - "example_modules/component.py", + "examples.example_modules.component", + "examples.example_modules/component", + "examples.example_modules.component.py", + "examples.example_modules/component.py", ], ) def test_get_module(module_str): """Test get module method.""" module = get_module(module_str) - assert module.__name__ == "example_modules.component" + assert module.__name__ == "examples.example_modules.component" def test_get_module_error(): @@ -77,7 +77,7 @@ def test_get_module_error(): "module_str", [ __name__, # cannot be split - "example_modules.component", # module does not exist + "examples.example_modules.component", # module does not exist ], ) def test_component_from_module(module_str): @@ -89,8 +89,10 @@ def test_component_from_module(module_str): @pytest.mark.parametrize( "module_str", [ - "example_modules.invalid_component", # module contains more than one component class - "example_modules.invalid_double_components", # module does not contain a component class + # module contains more than one component class + "examples.example_modules.invalid_component", + # module does not contain a component class + "examples.example_modules.invalid_double_components", ], ) def test_component_from_module_error(module_str): @@ -103,7 +105,7 @@ def test_component_from_module_error(module_str): "module_str", [ __name__, - "example_modules.pipeline", + "examples.example_modules.pipeline", ], ) def test_pipeline_from_module(module_str): @@ -115,8 +117,10 @@ def 
test_pipeline_from_module(module_str): @pytest.mark.parametrize( "module_str", [ - "example_modules.component", # module does not contain a pipeline instance - "example_modules.invalid_double_pipeline", # module contains many pipeline instances + # module does not contain a pipeline instance + "examples.example_modules.component", + # module contains many pipeline instances + "examples.example_modules.invalid_double_pipeline", ], ) def test_pipeline_from_module_error(module_str): @@ -417,7 +421,7 @@ def test_vertex_run(tmp_path_factory): def test_component_build(mock_build, mock_push): """Test that the build command works as expected.""" args = argparse.Namespace( - component_dir=Path(__file__).parent / "example_component", + component_dir=Path(__file__).parent / "examples/example_component", tag="image:test", build_arg=["key=value"], nocache=True, @@ -435,7 +439,7 @@ def test_component_build(mock_build, mock_push): # Check that docker build and push were executed correctly mock_build.assert_called_with( - path=str(Path(__file__).parent / "example_component"), + path=str(Path(__file__).parent / "examples/example_component"), tag="image:test", buildargs={"key": "value"}, nocache=True, @@ -449,7 +453,7 @@ def test_component_build(mock_build, mock_push): # Check that the component specification file was updated correctly with open( - Path(__file__).parent / "example_component" / "fondant_component.yaml", + Path(__file__).parent / "examples/example_component" / "fondant_component.yaml", "r+", ) as f: content = f.read() diff --git a/tox.ini b/tox.ini index acd58f104..d22216b49 100644 --- a/tox.ini +++ b/tox.ini @@ -48,6 +48,6 @@ commands_pre= poetry install --all-extras poetry show commands= - poetry run python -m pytest tests -vv --cov fondant --cov-report term-missing + poetry run python -m pytest tests -vv --cov fondant --cov-report term-missing --ignore=tests/integration_tests commands_post= bash ./scripts/post-build.sh
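
For reviewers skimming the diff: the common thread across these test changes is that `fondant.core.manifest` drops the subset indirection (`subsets.images.fields.data`) in favor of flat, top-level fields (`images_data`). The sketch below strings the migrated calls together in one place. It is a minimal illustration based only on the signatures exercised in the updated tests above; the pipeline, run, and field names are made up for the example.

```python
from fondant.core.manifest import Field, Manifest, Type

# Create an empty manifest, as in test_manifest_creation above.
# All names here are illustrative, not taken from a real pipeline.
manifest = Manifest.create(
    pipeline_name="example_pipeline",
    base_path="/base_path",
    run_id="example_pipeline_2024",
    component_id="component_1",
    cache_key="42",
)

# Fields now live directly on the manifest; there is no subset to create first.
manifest.add_or_update_field(Field(name="images_data", type=Type("binary")))

# Re-adding an existing field requires overwrite=True; otherwise a ValueError
# is raised, as the duplicate-field test above asserts.
manifest.add_or_update_field(
    Field(name="images_data", type=Type("string")),
    overwrite=True,
)
assert manifest.fields["images_data"].type == Type("string")

# Fields are removed by name rather than by removing a whole subset.
manifest.remove_field("images_data")
```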