diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml
index 11deb2329..f3014fccb 100644
--- a/.github/workflows/pipeline.yaml
+++ b/.github/workflows/pipeline.yaml
@@ -32,6 +32,26 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           COVERALLS_FLAG_NAME: test-${{ matrix.python-version }}
 
+  integration-test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [ '3.8', '3.9', '3.10' ]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install poetry==1.4.0
+          poetry install --all-extras --with test
+
+      - name: Execute sample pipeline
+        run: ./scripts/run_integration_tests.sh $GITHUB_SHA
+
   finish-coveralls:
     needs: test
     runs-on: ubuntu-latest
diff --git a/docs/runners/local.md b/docs/runners/local.md
index b412b825b..8e9584d77 100644
--- a/docs/runners/local.md
+++ b/docs/runners/local.md
@@ -46,7 +46,7 @@ about this in the [installation](../guides/installation.md) guide.
 
     fondant run local --auth-azure
     ```
 
-    You can also use the `--extra_volumes` argument to mount extra credentials or additional files.
+    You can also use the `--extra-volumes` argument to mount extra credentials or additional files.
     This volumes will be mounted to every component/service of the docker-compose spec.
 
diff --git a/examples/sample_pipeline/README.md b/examples/sample_pipeline/README.md
new file mode 100644
index 000000000..6ab76c9eb
--- /dev/null
+++ b/examples/sample_pipeline/README.md
@@ -0,0 +1,20 @@
+# Sample pipeline
+
+This example is a simple pipeline that uses two reusable components
+(load_from_parquet, chunk_text) and a custom dummy component. The custom dummy
+component simply returns the received dataframe unchanged.
+
+The pipeline can be executed with the Fondant CLI:
+
+```bash
+fondant run local pipeline.py
+```
+
+The automated integration test will use the `run.sh` script.
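+
+You can also invoke it directly for a local run, passing the Fondant git revision
+that the components should be built against (e.g. `main`):
+
+```bash
+./run.sh main
+```
\ No newline at end of file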
+ ENTRYPOINT ["fondant", "execute", "main"] \ No newline at end of file diff --git a/tests/integration_tests/sample_pipeline_test/components/dummy_component/README.md b/examples/sample_pipeline/components/dummy_component/README.md similarity index 100% rename from tests/integration_tests/sample_pipeline_test/components/dummy_component/README.md rename to examples/sample_pipeline/components/dummy_component/README.md diff --git a/tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml b/examples/sample_pipeline/components/dummy_component/fondant_component.yaml similarity index 83% rename from tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml rename to examples/sample_pipeline/components/dummy_component/fondant_component.yaml index 0a041fa3d..ada48083e 100644 --- a/tests/integration_tests/sample_pipeline_test/components/dummy_component/fondant_component.yaml +++ b/examples/sample_pipeline/components/dummy_component/fondant_component.yaml @@ -1,7 +1,6 @@ name: Dummy component description: Dummy component for testing custom components - -image: fndnt/dummy_component:dev +image: dummy_component consumes: text_data: diff --git a/examples/sample_pipeline/components/dummy_component/requirements.txt b/examples/sample_pipeline/components/dummy_component/requirements.txt new file mode 100644 index 000000000..e69de29bb diff --git a/tests/integration_tests/sample_pipeline_test/components/dummy_component/src/main.py b/examples/sample_pipeline/components/dummy_component/src/main.py similarity index 90% rename from tests/integration_tests/sample_pipeline_test/components/dummy_component/src/main.py rename to examples/sample_pipeline/components/dummy_component/src/main.py index bf0ddedcd..f17dad2eb 100644 --- a/tests/integration_tests/sample_pipeline_test/components/dummy_component/src/main.py +++ b/examples/sample_pipeline/components/dummy_component/src/main.py @@ -16,9 +16,10 @@ class DummyComponent(PandasTransformComponent): """Dummy component that returns the dataframe as it is.""" - def __init__(self, *_): + def __init__(self, *_, **kwargs): pass def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: """Dummy component that returns the dataframe as it is.""" + # raise RuntimeError return dataframe diff --git a/tests/integration_tests/sample_pipeline_test/data/sample.parquet b/examples/sample_pipeline/data/sample.parquet similarity index 100% rename from tests/integration_tests/sample_pipeline_test/data/sample.parquet rename to examples/sample_pipeline/data/sample.parquet diff --git a/examples/sample_pipeline/pipeline.py b/examples/sample_pipeline/pipeline.py new file mode 100644 index 000000000..b0acd9682 --- /dev/null +++ b/examples/sample_pipeline/pipeline.py @@ -0,0 +1,37 @@ +# This file contains a sample pipeline. 
+dataset.apply(
+    name_or_path="chunk_text",
+    arguments={"chunk_size": 10, "chunk_overlap": 2},
+    consumes={"text": "text_data"},
+)
diff --git a/examples/sample_pipeline/run.sh b/examples/sample_pipeline/run.sh
new file mode 100644
index 000000000..10a53c731
--- /dev/null
+++ b/examples/sample_pipeline/run.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# This script executes the sample pipeline in the example folder, checks that it
+# ran correctly, and cleans up the directory again.
+set -e
+GIT_HASH=$1
+
+
+# Set up teardown
+cleanup() {
+  rv=$?
+
+  # Try to remove the .artifacts folder
+  artifact_directory="./.artifacts"
+
+  if [ -d "$artifact_directory" ]; then
+    # Directory exists, remove it.
+    # Files can't be deleted in the CI/CD pipeline due to missing permissions; that is
+    # not necessarily a problem there, but cleanup is useful when running locally.
+    rm -rf "$artifact_directory" 2>/dev/null || true
+  fi
+
+  exit $rv
+}
+
+trap cleanup EXIT
+
+# Bind local data directory to pipeline
+data_dir=$(readlink -f "data")
+
+# Run pipeline
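+# --extra-volumes mounts the local data directory into every component container at /data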
\n" + f"Available field names: {list(manifest.fields.keys())}" ) raise InvalidPipelineDefinition( msg, diff --git a/tests/integration_tests/sample_pipeline_test/components/dummy_component/requirements.txt b/tests/integration_tests/sample_pipeline_test/components/dummy_component/requirements.txt deleted file mode 100644 index 54b4390d1..000000000 --- a/tests/integration_tests/sample_pipeline_test/components/dummy_component/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -langchain==0.0.329 \ No newline at end of file diff --git a/tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml b/tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml deleted file mode 100644 index eddb6e580..000000000 --- a/tests/integration_tests/sample_pipeline_test/components/load_from_parquet/fondant_component.yaml +++ /dev/null @@ -1,23 +0,0 @@ -name: Load from parquet -description: Component that loads a dataset from a parquet uri -image: fndnt/load_from_parquet:dev - -produces: - text_data: - type: string - -args: - dataset_uri: - description: The remote path to the parquet file/folder containing the dataset - type: str - column_name_mapping: - description: Mapping of the consumed dataset - type: dict - default: {} - n_rows_to_load: - description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale - type: int - index_column: - description: Column to set index to in the load component, if not specified a default globally unique index will be set - type: str - default: None \ No newline at end of file diff --git a/tests/integration_tests/test_sample_pipeline.py b/tests/integration_tests/test_sample_pipeline.py deleted file mode 100644 index fe1e7ab8f..000000000 --- a/tests/integration_tests/test_sample_pipeline.py +++ /dev/null @@ -1,76 +0,0 @@ -# This file contains a sample pipeline. 
-# This file contains a sample pipeline. Loading data from a parquet file,
-# using the load_from_parquet component, chain a custom dummy component, and use
-# the reusable chunking component
-import glob
-import logging
-import os
-from pathlib import Path
-
-import pytest
-from fondant.pipeline import Pipeline
-from fondant.pipeline.compiler import DockerCompiler
-from fondant.pipeline.runner import DockerRunner
-
-logger = logging.getLogger(__name__)
-
-
-BASE_PATH = Path("./tests/integration_tests/sample_pipeline_test")
-NUMBER_OF_COMPONENTS = 3
-
-
-@pytest.fixture()
-def sample_pipeline(data_dir="./data") -> Pipeline:
-    # Define pipeline
-    pipeline = Pipeline(name="dummy-pipeline", base_path=data_dir)
-
-    # Load from hub component
-    load_component_column_mapping = {
-        "text": "text_data",
-    }
-
-    dataset = pipeline.read(
-        name_or_path=Path(BASE_PATH / "components" / "load_from_parquet"),
-        arguments={
-            "dataset_uri": "/data/sample.parquet",
-            "column_name_mapping": load_component_column_mapping,
-            "n_rows_to_load": 5,
-        },
-    )
-
-    dataset = dataset.apply(
-        name_or_path=Path(BASE_PATH / "components" / "dummy_component"),
-    )
-
-    dataset.apply(
-        name_or_path="chunk_text",
-        arguments={"chunk_size": 10, "chunk_overlap": 2},
-    )
-
-    return pipeline
-
-
-@pytest.mark.skip(reason="Skipping due to random failure.")
-def test_local_runner(sample_pipeline, tmp_path_factory):
-    with tmp_path_factory.mktemp("temp") as data_dir:
-        sample_pipeline.base_path = str(data_dir)
-        DockerCompiler().compile(
-            sample_pipeline,
-            output_path="docker-compose.yaml",
-            extra_volumes=[
-                str(Path("tests/integration_tests/sample_pipeline_test/data").resolve())
-                + ":/data",
-            ],
-        )
-        DockerRunner().run("docker-compose.yaml")
-
-        assert os.path.exists(data_dir / "dummy-pipeline")
-        assert os.path.exists(data_dir / "dummy-pipeline" / "cache")
-        pipeline_dirs = glob.glob(
-            str(data_dir / "dummy-pipeline" / "dummy-pipeline-*" / "*"),
-        )
-
-        assert len(pipeline_dirs) == NUMBER_OF_COMPONENTS
-        for dir in pipeline_dirs:
-            assert os.path.exists(Path(dir) / "index")
-            assert os.path.exists(Path(dir) / "text")
-            assert os.path.exists(Path(dir) / "manifest.json")