From 7dcd1170421b07ccfd2441255b24c0a6724a382c Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 25 Jul 2023 13:36:27 +0200
Subject: [PATCH 01/65] More fixes

---
 components/filter_image_resolution/Dockerfile | 2 +-
 .../filter_image_resolution/fondant_component.yaml | 2 +-
 .../components/cluster_image_embeddings/Dockerfile | 2 +-
 .../components/cluster_image_embeddings/src/main.py | 2 +-
 .../components/filter_text_complexity/Dockerfile | 2 +-
 examples/pipelines/datacomp/pipeline.py | 13 +------------
 scripts/build_components.sh | 2 +-
 7 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile
index abfa9a414..e36badeaf 100644
--- a/components/filter_image_resolution/Dockerfile
+++ b/components/filter_image_resolution/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
+ARG FONDANT_VERSION=79df895e9d62d2010ccb8d40ee7e4fd4c68f117d
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml
index dcac31145..f54507827 100644
--- a/components/filter_image_resolution/fondant_component.yaml
+++ b/components/filter_image_resolution/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Filter image resolution
 description: Component that filters images based on minimum size and max aspect ratio
-image: ghcr.io/ml6team/filter_image_resolution:latest
+image: ghcr.io/ml6team/filter_image_resolution:79df895e9d62d2010ccb8d40ee7e4fd4c68f117d

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/cluster_image_embeddings/Dockerfile b/examples/pipelines/datacomp/components/cluster_image_embeddings/Dockerfile
index abfa9a414..e36badeaf 100644
--- a/examples/pipelines/datacomp/components/cluster_image_embeddings/Dockerfile
+++ b/examples/pipelines/datacomp/components/cluster_image_embeddings/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
+ARG FONDANT_VERSION=79df895e9d62d2010ccb8d40ee7e4fd4c68f117d
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/cluster_image_embeddings/src/main.py b/examples/pipelines/datacomp/components/cluster_image_embeddings/src/main.py
index c2ee12c51..6d9b7420c 100644
--- a/examples/pipelines/datacomp/components/cluster_image_embeddings/src/main.py
+++ b/examples/pipelines/datacomp/components/cluster_image_embeddings/src/main.py
@@ -16,7 +16,7 @@
 class ClusterImageEmbeddingsComponent(DaskTransformComponent):
     """Component that clusters images based on embeddings."""

-    def __init__(self, sample_ratio: float, num_clusters: int) -> None:
+    def __init__(self, *_, sample_ratio: float, num_clusters: int) -> None:
         self.sample_ratio = sample_ratio
         self.num_clusters = num_clusters
diff --git a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile
index 397a5f37b..610851a01 100644
--- a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile
+++ b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile
@@ -12,7 +12,7 @@ RUN python -m spacy download en_core_web_sm

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
+ARG FONDANT_VERSION=79df895e9d62d2010ccb8d40ee7e4fd4c68f117d
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 9af28c365..f7a922690 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -7,14 +7,13 @@

 from pipeline_configs import PipelineConfigs

-from fondant.compiler import DockerCompiler
 from fondant.pipeline import ComponentOp, Pipeline, Client

 logger = logging.getLogger(__name__)

 # Initialize pipeline and client
 pipeline = Pipeline(
-    pipeline_name="Datacomp filtering pipeline",
+    pipeline_name="datacomp-filtering-pipeline",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
     # base_path=PipelineConfigs.BASE_PATH,
     base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
 )
@@ -69,13 +68,3 @@
 pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops
-
-# compile
-if __name__ == "__main__":
-    compiler = DockerCompiler()
-    # mount the gcloud credentials to the container
-    extra_volumes = [
-        "$HOME/.config/gcloud/application_default_credentials.json:/root/.config/gcloud/application_default_credentials.json:ro"
-    ]
-    compiler.compile(pipeline=pipeline, extra_volumes=extra_volumes)
-    logger.info("Run `docker compose up` to run the pipeline.")
diff --git a/scripts/build_components.sh b/scripts/build_components.sh
index a78544802..265d08b83 100755
--- a/scripts/build_components.sh
+++ b/scripts/build_components.sh
@@ -7,7 +7,7 @@ function usage {
   echo " -t, --tag Tag to add to image, repeatable
                The first tag is set in the component specifications"
   echo " -c, --cache Use registry caching when building the components (default:false)"
-  echo " -d, --component-dirs Directory containing components to build as subdirectories.
+  echo " -d, --components-dir Directory containing components to build as subdirectories.
                The path should be relative to the root directory (default:components)"
   echo " -n, --namespace The namespace for the built images, should match the github organization (default: ml6team)"
   echo " -co, --component Specific component to build. Pass the component subdirectory name(s) to build
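A note on the *_ signature change in patch 01 above: a bare `*_` in a parameter list collects and discards any positional arguments, so everything declared after it becomes keyword-only. A minimal standalone sketch of the idiom (the class name is illustrative, not from the codebase):

    class ExampleComponent:
        def __init__(self, *_, sample_ratio: float, num_clusters: int) -> None:
            # sample_ratio and num_clusters can only be passed by keyword
            self.sample_ratio = sample_ratio
            self.num_clusters = num_clusters

    ExampleComponent(sample_ratio=0.5, num_clusters=10)  # works
    # ExampleComponent(0.5, 10) raises TypeError: the positional values are
    # swallowed by *_ and the required keyword-only arguments are missing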
From 05c71564ae873a6454595925f574f5e03c09f3d8 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 25 Jul 2023 16:13:05 +0200
Subject: [PATCH 02/65] More improvements

---
 .../components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index b89a10324..6eb56e741 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:dev
+image: ghcr.io/ml6team/load_from_hf_hub:56b83265bc80e0f98559e58d01d1f18575b85d6b

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index f7a922690..52763a711 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -38,7 +38,7 @@
     arguments={
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 100,
+        "n_rows_to_load": 500000,
     },
 )
 filter_image_resolution_op = ComponentOp.from_registry(
@@ -64,7 +64,9 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
-pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
-pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
+# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+# pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
+# pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops
+
+# client.compile_and_run(pipeline=pipeline)
\ No newline at end of file

From aff3fee38324256cfbc678f6b81d00a88340e967 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 25 Jul 2023 16:13:57 +0200
Subject: [PATCH 03/65] More improvements

---
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 components/load_from_hf_hub/src/main.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 0099a92f8..3d6741e46 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:dev
+image: ghcr.io/ml6team/load_from_hf_hub:56b83265bc80e0f98559e58d01d1f18575b85d6b

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index a55daf0c9..7b3aaf0c6 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -48,8 +48,7 @@ def load(self) -> dd.DataFrame:

         # 4) Optional: only return specific amount of rows
         if self.n_rows_to_load:
-            dask_df = dask_df.head(self.n_rows_to_load)
-            dask_df = dd.from_pandas(dask_df, npartitions=1)
+            dask_df = dask_df.loc[:self.n_rows_to_load]

         return dask_df
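The `.loc[:n]` slicing introduced in patch 03 selects by index label (and is inclusive) rather than by position, which only matches "the first n rows" when the index happens to be a clean 0..N-1 range; later patches in this series move back to `head()`. A small sketch of the difference, assuming a default integer index:

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": range(100)}), npartitions=4)

    # label-based and inclusive: returns rows with index label <= 10 (11 rows),
    # and only equals "the first 11 rows" because the index is 0..99
    subset = ddf.loc[:10].compute()

    # position-based: always the first 10 rows, returned as a pandas DataFrame
    first_ten = ddf.head(10)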
From 710685bfed896e04726af6371a20f778eda5ab2e Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 25 Jul 2023 16:33:26 +0200
Subject: [PATCH 04/65] Add logging

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/src/main.py | 4 +++-
 src/fondant/data_io.py | 6 +++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index abfa9a414..ec5c49504 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
+ARG FONDANT_VERSION=56b83265bc80e0f98559e58d01d1f18575b85d6b
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index 7b3aaf0c6..09b9117d6 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -44,10 +44,12 @@ def load(self) -> dd.DataFrame:
         )

         # 3) Rename columns
+        logger.info("Renaming columns...")
         dask_df = dask_df.rename(columns=self.column_name_mapping)

         # 4) Optional: only return specific amount of rows
-        if self.n_rows_to_load:
+        if self.n_rows_to_load is not None:
+            logger.info(f"Loading first {self.n_rows_to_load} rows...")
             dask_df = dask_df.loc[:self.n_rows_to_load]

         return dask_df
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 4d8b116d8..409945eac 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -193,6 +193,10 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:
         write_tasks = []

         dataframe = self.partition_written_dataframe(dataframe)
+
+        logger.info("Dataframe number of partitions:", dataframe.npartitions)
+
+        logger.info("Creating write tasks...")

         dataframe.index = dataframe.index.rename("id").astype("string")
@@ -286,6 +290,7 @@ def _create_write_task(
         A delayed Dask task that uploads the DataFrame to the remote storage location
         when executed.
         """
+        logging.info(f"Creating write task for: {location}")
         write_task = dd.to_parquet(
             dataframe,
             location,
@@ -293,5 +298,4 @@
             overwrite=False,
             compute=False,
         )
-        logging.info(f"Creating write task for: {location}")
         return write_task
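For context on the write tasks mentioned in the logging above: with `compute=False`, `dd.to_parquet` returns a delayed task instead of writing immediately, so multiple writes can be collected and executed in a single `dd.compute` call. A minimal sketch (the output path is a placeholder):

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)

    # lazy write: returns a delayed task instead of writing now
    write_task = dd.to_parquet(ddf, "/tmp/example_subset", compute=False)

    # several such tasks can be gathered and executed together
    dd.compute(write_task)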
""" + logging.info(f"Creating write task for: {location}") write_task = dd.to_parquet( dataframe, location, @@ -293,5 +298,4 @@ def _create_write_task( overwrite=False, compute=False, ) - logging.info(f"Creating write task for: {location}") return write_task From c070babd930ed0945ab04e802242c21844443a2b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 25 Jul 2023 16:43:02 +0200 Subject: [PATCH 05/65] Update dockerfile --- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index ec5c49504..6a06fb3fc 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=56b83265bc80e0f98559e58d01d1f18575b85d6b +ARG FONDANT_VERSION=6b2843e3c5c9e2df22ac24a560216eb89211f9ee RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 3d6741e46..92e53c66a 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:56b83265bc80e0f98559e58d01d1f18575b85d6b +image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 6eb56e741..18af6c11e 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:56b83265bc80e0f98559e58d01d1f18575b85d6b +image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee produces: image: From 698b92caeae5f3ed6296e580cbe90b5941de33d6 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 25 Jul 2023 17:00:51 +0200 Subject: [PATCH 06/65] Fix dtype --- .../components/load_from_hf_hub/fondant_component.yaml | 4 ++-- examples/pipelines/datacomp/pipeline.py | 6 +++--- src/fondant/schemas/common.json | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 18af6c11e..9b58d5364 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -8,9 +8,9 @@ produces: url: type: string width: - type: int16 + type: int64 height: - type: int16 + type: int64 face_bboxes: type: array items: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 52763a711..4035b66c2 100644 --- 
a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -69,4 +69,4 @@ # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops -# client.compile_and_run(pipeline=pipeline) \ No newline at end of file +client.compile_and_run(pipeline=pipeline) \ No newline at end of file diff --git a/src/fondant/schemas/common.json b/src/fondant/schemas/common.json index 969ecd1a8..11df4e988 100644 --- a/src/fondant/schemas/common.json +++ b/src/fondant/schemas/common.json @@ -7,6 +7,7 @@ "int8", "int16", "int32", + "int64", "uint8", "uint16", "uint32", From 6ed53842308e2d6411946f9f2cb58231aae69996 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 25 Jul 2023 17:35:32 +0200 Subject: [PATCH 07/65] Update Dockerfile --- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index 6a06fb3fc..662017ba7 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=6b2843e3c5c9e2df22ac24a560216eb89211f9ee +ARG FONDANT_VERSION=8174eb4a23b742f64a837daf6b5c5b929d70db61 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 92e53c66a..ed83827a6 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee +image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61 produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 9b58d5364..4fcafd38c 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee +image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61 produces: image: From f253d9c65412aabc4ce0c745fd5bd2f5bd7f88ed Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 26 Jul 2023 08:59:45 +0200 Subject: [PATCH 08/65] More updates --- components/load_from_hf_hub/Dockerfile | 2 +- 
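The dtype fix above has two parts: the component spec now declares width/height as int64, and the shared type enum in common.json has to accept "int64" for that spec to validate. A trimmed-down, hypothetical sketch of that validation step (the schema dict here is a stand-in, not the real common.json):

    import jsonschema

    # simplified stand-in for the type enum in src/fondant/schemas/common.json
    field_type_schema = {"enum": ["int8", "int16", "int32", "int64", "uint8"]}

    # passes once "int64" is part of the enum; raises ValidationError otherwise
    jsonschema.validate("int64", field_type_schema)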
From 6ed53842308e2d6411946f9f2cb58231aae69996 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 25 Jul 2023 17:35:32 +0200
Subject: [PATCH 07/65] Update Dockerfile

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 6a06fb3fc..662017ba7 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=6b2843e3c5c9e2df22ac24a560216eb89211f9ee
+ARG FONDANT_VERSION=8174eb4a23b742f64a837daf6b5c5b929d70db61
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 92e53c66a..ed83827a6 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee
+image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index 9b58d5364..4fcafd38c 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee
+image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61

 produces:
   image:

From f253d9c65412aabc4ce0c745fd5bd2f5bd7f88ed Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 08:59:45 +0200
Subject: [PATCH 08/65] More updates

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 7 ++++---
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 662017ba7..55ec91ab6 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=8174eb4a23b742f64a837daf6b5c5b929d70db61
+ARG FONDANT_VERSION=42bb3f62a4411108e88e6fdc353494cf8fe9d367
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index ed83827a6..165a1793f 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61
+image: ghcr.io/ml6team/load_from_hf_hub:42bb3f62a4411108e88e6fdc353494cf8fe9d367

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index 4fcafd38c..ea7537864 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61
+image: ghcr.io/ml6team/load_from_hf_hub:42bb3f62a4411108e88e6fdc353494cf8fe9d367

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 4035b66c2..bc24046ba 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -15,8 +15,8 @@
 pipeline = Pipeline(
     pipeline_name="datacomp-filtering-pipeline",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
-    base_path=PipelineConfigs.BASE_PATH,
-    # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
+    # base_path=PipelineConfigs.BASE_PATH,
+    base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
 )
 client = Client(host=PipelineConfigs.HOST)
@@ -37,6 +37,7 @@
     component_dir="components/load_from_hf_hub",
     arguments={
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
+        "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
         "n_rows_to_load": 500000,
     },
@@ -69,4 +70,4 @@
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

-client.compile_and_run(pipeline=pipeline)
\ No newline at end of file
+# client.compile_and_run(pipeline=pipeline)
\ No newline at end of file

From 2c990c40ac857bf7a7cb42d09627e6ef907276be Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 09:08:53 +0200
Subject: [PATCH 09/65] Update logging

---
 examples/pipelines/datacomp/pipeline.py | 2 +-
 src/fondant/data_io.py | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index bc24046ba..c6b5b3d78 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,7 +39,7 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 500000,
+        "n_rows_to_load": 10,
     },
 )
 filter_image_resolution_op = ComponentOp.from_registry(
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 409945eac..8b3123067 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -163,20 +163,19 @@ def partition_written_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame:
         """
         if self.output_partition_size != "disable":
             if isinstance(self.output_partition_size, str):
-                dataframe = dataframe.repartition(
-                    partition_size=self.output_partition_size,
-                )
                 logger.info(
                     f"Repartitioning the written data such that the size per partition is approx."
                     f" {self.output_partition_size}",
                 )
-
+                dataframe = dataframe.repartition(
+                    partition_size=self.output_partition_size,
+                )
             elif self.output_partition_size is None:
-                dataframe = dataframe.repartition(partition_size="250MB")
                 logger.info(
                     "Repartitioning the written data such that the size per partition is approx."
                     " 250MB. (Automatic repartitioning)",
                 )
+                dataframe = dataframe.repartition(partition_size="250MB")
             else:
                 msg = (
                     f"{self.output_partition_size} is not a valid argument. Choose either the"
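For reference, the repartitioning that this logging describes: Dask can split or merge partitions to approximate a target size per partition, 250MB being the automatic fallback used here. A minimal standalone sketch:

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": range(1_000_000)}), npartitions=1)

    # re-split the data so each partition is roughly 250MB
    ddf = ddf.repartition(partition_size="250MB")
    print(ddf.npartitions)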
From 87df957d892a95d34e5828a3490b6157dc8bb3f7 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 10:35:12 +0200
Subject: [PATCH 10/65] More improvements

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 components/load_from_hf_hub/src/main.py | 3 ++-
 .../components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 4 ++--
 src/fondant/data_io.py | 2 +-
 6 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 55ec91ab6..5c85e3b44 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=42bb3f62a4411108e88e6fdc353494cf8fe9d367
+ARG FONDANT_VERSION=a8ba56e7e38468872eb4a9829c77b2b1aa2003e0
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 165a1793f..8e2d92b3a 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:42bb3f62a4411108e88e6fdc353494cf8fe9d367
+image: ghcr.io/ml6team/load_from_hf_hub:a8ba56e7e38468872eb4a9829c77b2b1aa2003e0

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index 09b9117d6..f3931f303 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -50,7 +50,8 @@ def load(self) -> dd.DataFrame:
         # 4) Optional: only return specific amount of rows
         if self.n_rows_to_load is not None:
             logger.info(f"Loading first {self.n_rows_to_load} rows...")
-            dask_df = dask_df.loc[:self.n_rows_to_load]
+            dask_df = dask_df.head(self.n_rows_to_load)
+            dask_df = dd.from_pandas(dask_df, npartitions=1)

         return dask_df
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index ea7537864..4ebeb261e 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:42bb3f62a4411108e88e6fdc353494cf8fe9d367
+image: ghcr.io/ml6team/load_from_hf_hub:a8ba56e7e38468872eb4a9829c77b2b1aa2003e0

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index c6b5b3d78..c597b5e60 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,7 +39,7 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 10,
+        "n_rows_to_load": 500000,
     },
 )
 filter_image_resolution_op = ComponentOp.from_registry(
@@ -70,4 +70,4 @@
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

-# client.compile_and_run(pipeline=pipeline)
\ No newline at end of file
+client.compile_and_run(pipeline=pipeline)
\ No newline at end of file
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 8b3123067..2f97252fb 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -193,7 +193,7 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:

         dataframe = self.partition_written_dataframe(dataframe)

-        logger.info("Dataframe number of partitions:", dataframe.npartitions)
+        logger.info(f"Dataframe number of partitions is {dataframe.npartitions}")

         logger.info("Creating write tasks...")

From f702be1c7a981840aacad77fcbcf1369f890d7cb Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 10:44:22 +0200
Subject: [PATCH 11/65] Update specs

---
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 8e2d92b3a..f1d849c57 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:a8ba56e7e38468872eb4a9829c77b2b1aa2003e0
+image: ghcr.io/ml6team/load_from_hf_hub:1f1ab27717d70a135f825d9fb97fb1ed038262c9

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index 4ebeb261e..712fb48e3 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:a8ba56e7e38468872eb4a9829c77b2b1aa2003e0
+image: ghcr.io/ml6team/load_from_hf_hub:1f1ab27717d70a135f825d9fb97fb1ed038262c9

 produces:
   image:

From 7c19cc74da09203a113ff0ffa3b9e43d846c5c63 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 11:08:47 +0200
Subject: [PATCH 12/65] Improve load_from_hf_hub component

---
 components/load_from_hf_hub/src/main.py | 12 +++++++++++-
 examples/pipelines/datacomp/pipeline.py | 2 ++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index f3931f303..e7d2c447b 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -16,6 +16,7 @@ def __init__(self, *_,
                  column_name_mapping: dict,
                  image_column_names: t.Optional[list],
                  n_rows_to_load: t.Optional[int],
+                 dataset_length: int,
                  ) -> None:
         """
         Args:
@@ -25,11 +26,14 @@
                 format the image from HF hub format to a byte string
             n_rows_to_load: optional argument that defines the number of rows to load.
                 Useful for testing pipeline runs on a small scale.
+            dataset_length: optional argument that specifies the length of the entire dataset. Only
+                required in case n_rows_to_load is specified.
         """
         self.dataset_name = dataset_name
         self.column_name_mapping = column_name_mapping
         self.image_column_names = image_column_names
         self.n_rows_to_load = n_rows_to_load
+        self.dataset_length = dataset_length

     def load(self) -> dd.DataFrame:
         # 1) Load data, read as Dask dataframe
@@ -49,8 +53,14 @@ def load(self) -> dd.DataFrame:

         # 4) Optional: only return specific amount of rows
         if self.n_rows_to_load is not None:
+            if self.dataset_length is None:
+                raise ValueError("""Make sure to also specify the length of the entire
+                                 dataset. This is required as otherwise only the first
+                                 partition can be loaded""")
             logger.info(f"Loading first {self.n_rows_to_load} rows...")
-            dask_df = dask_df.head(self.n_rows_to_load)
+            partition_length = self.dataset_length // dask_df.npartitions
+            npartitions = self.n_rows_to_load // partition_length
+            dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions)
             dask_df = dd.from_pandas(dask_df, npartitions=1)

         return dask_df
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index c597b5e60..43bec4068 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -40,7 +40,9 @@
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
         "n_rows_to_load": 500000,
+        "dataset_length": 12800000,
     },
+    node_pool_name="n2-standard-128-pool",
 )
 filter_image_resolution_op = ComponentOp.from_registry(
     name="filter_image_resolution",
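The head-based loading introduced in patch 12, in standalone form: by default `head()` only inspects the first partition, so the component estimates how many partitions are needed to cover `n_rows_to_load`, using the user-supplied total dataset length. A sketch with made-up numbers:

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": range(1000)}), npartitions=10)

    n_rows_to_load = 200
    dataset_length = 1000

    # roughly 100 rows per partition, so 2 partitions cover 200 rows
    partition_length = dataset_length // ddf.npartitions
    npartitions = n_rows_to_load // partition_length

    # head() returns a pandas DataFrame; the component converts it back
    # to a (single-partition) Dask dataframe afterwards
    pdf = ddf.head(n_rows_to_load, npartitions=npartitions)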
From 08bc45a01314eeaab16cb084209133eee7c900e3 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 11:43:08 +0200
Subject: [PATCH 13/65] Update specs

---
 components/filter_image_resolution/Dockerfile | 2 +-
 .../filter_image_resolution/fondant_component.yaml | 6 +++---
 components/load_from_hf_hub/fondant_component.yaml | 9 +++++++--
 components/load_from_hf_hub/src/main.py | 2 +-
 .../components/load_from_hf_hub/fondant_component.yaml | 6 +++++-
 examples/pipelines/datacomp/pipeline.py | 6 +++---
 6 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile
index e36badeaf..bc16d1c64 100644
--- a/components/filter_image_resolution/Dockerfile
+++ b/components/filter_image_resolution/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=79df895e9d62d2010ccb8d40ee7e4fd4c68f117d
+ARG FONDANT_VERSION=41bd8f9d8f8003b41ffa375c1887869208e519ea
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml
index f54507827..096bfb730 100644
--- a/components/filter_image_resolution/fondant_component.yaml
+++ b/components/filter_image_resolution/fondant_component.yaml
@@ -1,14 +1,14 @@
 name: Filter image resolution
 description: Component that filters images based on minimum size and max aspect ratio
-image: ghcr.io/ml6team/filter_image_resolution:79df895e9d62d2010ccb8d40ee7e4fd4c68f117d
+image: ghcr.io/ml6team/filter_image_resolution:41bd8f9d8f8003b41ffa375c1887869208e519ea

 consumes:
   image:
     fields:
       width:
-        type: int16
+        type: int64
       height:
-        type: int16
+        type: int64

 args:
   min_image_dim:
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index f1d849c57..5bfacee04 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:1f1ab27717d70a135f825d9fb97fb1ed038262c9
+image: ghcr.io/ml6team/load_from_hf_hub:41bd8f9d8f8003b41ffa375c1887869208e519ea

 produces:
   dummy_variable: #TODO: fill in here
@@ -23,4 +23,9 @@
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
     type: int
-    default: None
\ No newline at end of file
+    default: None
+  dataset_length:
+    description: Optional argument that defines the length of the dataset. Required in case `n_rows_to_load` is specified.
+    type: int
+    default: None
+
\ No newline at end of file
diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index e7d2c447b..8fe5266b8 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -57,7 +57,7 @@ def load(self) -> dd.DataFrame:
                 raise ValueError("""Make sure to also specify the length of the entire
                                  dataset. This is required as otherwise only the first
                                  partition can be loaded""")
-            logger.info(f"Loading first {self.n_rows_to_load} rows...")
+            logger.info(f"Loading approximately {self.n_rows_to_load} rows...")
             partition_length = self.dataset_length // dask_df.npartitions
             npartitions = self.n_rows_to_load // partition_length
             dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions)
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index 712fb48e3..af70ce67d 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:1f1ab27717d70a135f825d9fb97fb1ed038262c9
+image: ghcr.io/ml6team/load_from_hf_hub:41bd8f9d8f8003b41ffa375c1887869208e519ea

 produces:
   image:
@@ -51,4 +51,8 @@ args:
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
     type: int
+    default: None
+  dataset_length:
+    description: Optional argument that defines the length of the dataset. Required in case `n_rows_to_load` is specified.
+    type: int
     default: None
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 43bec4068..5dccfa176 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -15,8 +15,8 @@
 pipeline = Pipeline(
     pipeline_name="datacomp-filtering-pipeline",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
-    # base_path=PipelineConfigs.BASE_PATH,
-    base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
+    base_path=PipelineConfigs.BASE_PATH,
+    # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
 )
 client = Client(host=PipelineConfigs.HOST)
@@ -67,7 +67,7 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

From 0912acc0a3b6516278df6324bfff30ca17c119a0 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 13:28:06 +0200
Subject: [PATCH 14/65] Add task graph

---
 examples/pipelines/datacomp/pipeline.py | 6 ++++--
 src/fondant/data_io.py | 3 +++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 5dccfa176..84364ae7e 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,8 +39,10 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 500000,
+        "n_rows_to_load": 10,
         "dataset_length": 12800000,
+        # "n_rows_to_load": 500000,
+        # "dataset_length": 12800000,
     },
     node_pool_name="n2-standard-128-pool",
 )
@@ -67,7 +69,7 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 2f97252fb..71eba6e58 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -2,6 +2,7 @@
 import os
 import typing as t

+import dask
 import dask.dataframe as dd
 from dask.diagnostics import ProgressBar
@@ -223,6 +224,8 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:

         with ProgressBar():
             logging.info("Writing data...")
+            # visualize the low level Dask graph
+            dask.visualize(*write_tasks, filename='task_graph.png')
             dd.compute(*write_tasks)

     @staticmethod
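The `dask.visualize` call added in patch 14 renders the low-level task graph to an image; it needs the graphviz package, which is why the next patch adds it as a dependency. A minimal sketch:

    import dask
    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    task = dd.to_parquet(ddf, "/tmp/example", compute=False)

    # writes the task graph to task_graph.png (requires graphviz)
    dask.visualize(task, filename="task_graph.png")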
From 9d5208de14aa71965e5f81ff979ac2a428aef09f Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 13:36:12 +0200
Subject: [PATCH 15/65] Add graphviz to the dependencies

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 2 +-
 pyproject.toml | 1 +
 5 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 5c85e3b44..4677f2b9e 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=a8ba56e7e38468872eb4a9829c77b2b1aa2003e0
+ARG FONDANT_VERSION=c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 5bfacee04..05343e40d 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:41bd8f9d8f8003b41ffa375c1887869208e519ea
+image: ghcr.io/ml6team/load_from_hf_hub:c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index af70ce67d..ac9fa1a3a 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:41bd8f9d8f8003b41ffa375c1887869208e519ea
+image: ghcr.io/ml6team/load_from_hf_hub:c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 84364ae7e..6b1431d67 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,7 +39,7 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 10,
+        "n_rows_to_load": 100000,
         "dataset_length": 12800000,
         # "n_rows_to_load": 500000,
         # "dataset_length": 12800000,
diff --git a/pyproject.toml b/pyproject.toml
index 7b837de9f..b074eb7a2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ dask = {extras = ["dataframe"], version = ">= 2023.4.1"}
 importlib-resources = { version = ">= 1.3", python = "<3.9" }
 jsonschema = ">= 4.18"
 pyarrow = ">= 11.0.0"
+graphviz = ">= 0.20.1"

 fsspec = { version = ">= 2023.4.0", optional = true}
 gcsfs = { version = ">= 2023.4.0", optional = true }

From 0d42734a2fd8d2c4ee1c6340b59fddd0bae93f9c Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 14:02:31 +0200
Subject: [PATCH 16/65] Update Dockerfile

---
 components/load_from_hf_hub/Dockerfile | 5 +++--
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 4 ++--
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 4677f2b9e..34c2e3a05 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -3,7 +3,8 @@ FROM --platform=linux/amd64 python:3.8-slim
 # System dependencies
 RUN apt-get update && \
     apt-get upgrade -y && \
-    apt-get install git -y
+    apt-get install git -y && \
+    apt-get install graphviz -y

 # Install requirements
 COPY requirements.txt /
@@ -11,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769
+ARG FONDANT_VERSION=797f1a81694e4d66c4fe39edbbc9fc2dafce830a
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 05343e40d..9310e76d6 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769
+image: ghcr.io/ml6team/load_from_hf_hub:797f1a81694e4d66c4fe39edbbc9fc2dafce830a

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index ac9fa1a3a..34b9b5ee5 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769
+image: ghcr.io/ml6team/load_from_hf_hub:797f1a81694e4d66c4fe39edbbc9fc2dafce830a

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 6b1431d67..ae14144d5 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -15,8 +15,8 @@
 pipeline = Pipeline(
     pipeline_name="datacomp-filtering-pipeline",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
-    base_path=PipelineConfigs.BASE_PATH,
-    # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
+    # base_path=PipelineConfigs.BASE_PATH,
+    base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
 )
 client = Client(host=PipelineConfigs.HOST)

From 11a4ec75d8d302c639cc2c1d75001b870e0d35bd Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 14:05:40 +0200
Subject: [PATCH 17/65] Add more

---
 src/fondant/data_io.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 71eba6e58..4a52aa540 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -225,7 +225,10 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:
         with ProgressBar():
             logging.info("Writing data...")
             # visualize the low level Dask graph
+            logging.info("Visualizing task graph...")
             dask.visualize(*write_tasks, filename='task_graph.png')
+            for i, task in enumerate(write_tasks):
+                task.visualize(filename=f'task_{i}.svg')
             dd.compute(*write_tasks)

From 86a133678ba8727393dca1bc1d4c91eac2668e22 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 14:58:19 +0200
Subject: [PATCH 18/65] Add visualize

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 src/fondant/data_io.py | 2 ++
 4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 34c2e3a05..a3637ad4b 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=797f1a81694e4d66c4fe39edbbc9fc2dafce830a
+ARG FONDANT_VERSION=51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 9310e76d6..29840d667 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:797f1a81694e4d66c4fe39edbbc9fc2dafce830a
+image: ghcr.io/ml6team/load_from_hf_hub:51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index 34b9b5ee5..a1f0c728e 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:797f1a81694e4d66c4fe39edbbc9fc2dafce830a
+image: ghcr.io/ml6team/load_from_hf_hub:51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42

 produces:
   image:
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 4a52aa540..d36412ad4 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -200,6 +200,8 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:

         dataframe.index = dataframe.index.rename("id").astype("string")

+        dataframe.visualize(filename=f'dataframe.png')
+
         # Turn index into an empty dataframe so we can write it
         index_df = dataframe.index.to_frame().drop(columns=["id"])
         write_index_task = self._write_subset(

From eefd05a7ba5ace9f10bfe3655effbbd9c59912ce Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 15:06:34 +0200
Subject: [PATCH 19/65] More improvements

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index a3637ad4b..d95a42f67 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42
+ARG FONDANT_VERSION=feac9c142c322f1c00e30c4c0f7052dfa6bf3c92
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 29840d667..0e8fbbdb9 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42
+image: ghcr.io/ml6team/load_from_hf_hub:feac9c142c322f1c00e30c4c0f7052dfa6bf3c92

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index a1f0c728e..a57a0da2c 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42
+image: ghcr.io/ml6team/load_from_hf_hub:feac9c142c322f1c00e30c4c0f7052dfa6bf3c92

 produces:
   image:

From 88dec539e1d03ddbbba46bd920e1e85fe5b3eb56 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 15:33:44 +0200
Subject: [PATCH 20/65] Fix visualization

---
 examples/pipelines/datacomp/pipeline.py | 6 ++----
 src/fondant/data_io.py | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index ae14144d5..a22a42479 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,10 +39,8 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 100000,
+        "n_rows_to_load": 10000,
         "dataset_length": 12800000,
-        # "n_rows_to_load": 500000,
-        # "dataset_length": 12800000,
     },
     node_pool_name="n2-standard-128-pool",
 )
@@ -67,7 +67,7 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index d36412ad4..771af8d53 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -200,7 +200,7 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:

         dataframe.index = dataframe.index.rename("id").astype("string")

-        dataframe.visualize(filename=f'dataframe.png')
+        dataframe.visualize(filename=f'{self.manifest.base_path}/graph.png')

         # Turn index into an empty dataframe so we can write it
         index_df = dataframe.index.to_frame().drop(columns=["id"])

From a1bcb50358e9bfdbf52f62db7b097cbdd971a493 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 15:58:46 +0200
Subject: [PATCH 21/65] Remove line

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 components/load_from_hf_hub/src/main.py | 1 -
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 2 +-
 5 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index d95a42f67..7f4286878 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=feac9c142c322f1c00e30c4c0f7052dfa6bf3c92
+ARG FONDANT_VERSION=42a948b606e0c84f1e042e52d207a820a5df48d2
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 0e8fbbdb9..7c7ab69eb 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:feac9c142c322f1c00e30c4c0f7052dfa6bf3c92
+image: ghcr.io/ml6team/load_from_hf_hub:42a948b606e0c84f1e042e52d207a820a5df48d2

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index 8fe5266b8..d4bacf1ba 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -61,7 +61,6 @@ def load(self) -> dd.DataFrame:
             partition_length = self.dataset_length // dask_df.npartitions
             npartitions = self.n_rows_to_load // partition_length
             dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions)
-            dask_df = dd.from_pandas(dask_df, npartitions=1)

         return dask_df
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index a57a0da2c..6807bfeb7 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:feac9c142c322f1c00e30c4c0f7052dfa6bf3c92
+image: ghcr.io/ml6team/load_from_hf_hub:42a948b606e0c84f1e042e52d207a820a5df48d2

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index a22a42479..393030241 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -67,7 +67,7 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

From 4ddca85426e231edc71d18dc713e3fb4e3b3c6e8 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 16:46:32 +0200
Subject: [PATCH 22/65] More improvements

---
 components/load_from_hf_hub/src/main.py | 1 +
 examples/pipelines/datacomp/pipeline.py | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index d4bacf1ba..8e3151ea1 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -61,6 +61,7 @@ def load(self) -> dd.DataFrame:
             partition_length = self.dataset_length // dask_df.npartitions
             npartitions = self.n_rows_to_load // partition_length
             dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions)
+            dask_df = dd.from_pandas(dask_df, npartitions=npartitions).reset_index(drop=True)

         return dask_df
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 393030241..0017e720e 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,10 +39,11 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 10000,
+        "n_rows_to_load": 100000,
         "dataset_length": 12800000,
     },
     node_pool_name="n2-standard-128-pool",
+    output_partition_size="10MB",
 )
 filter_image_resolution_op = ComponentOp.from_registry(
     name="filter_image_resolution",
@@ -67,7 +68,7 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

From 30fcfe31eca1ed5d58afd49499a59b9c18274965 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 17:23:38 +0200
Subject: [PATCH 23/65] Add print statements

---
 components/filter_image_resolution/Dockerfile | 5 +++--
 .../filter_image_resolution/fondant_component.yaml | 2 +-
 components/filter_image_resolution/src/main.py | 4 ++++
 examples/pipelines/datacomp/pipeline.py | 2 ++
 src/fondant/data_io.py | 11 +++++++++++
 5 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile
index bc16d1c64..3817dfb39 100644
--- a/components/filter_image_resolution/Dockerfile
+++ b/components/filter_image_resolution/Dockerfile
@@ -3,7 +3,8 @@ FROM --platform=linux/amd64 python:3.8-slim
 # System dependencies
 RUN apt-get update && \
     apt-get upgrade -y && \
-    apt-get install git -y
+    apt-get install git -y && \
+    apt-get install graphviz -y

 # Install requirements
 COPY requirements.txt /
@@ -11,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=41bd8f9d8f8003b41ffa375c1887869208e519ea
+ARG FONDANT_VERSION=1aaf6068e7135b917e5e5cb16b3a2bf9d58f4a4e
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml
index 096bfb730..442dbe079 100644
--- a/components/filter_image_resolution/fondant_component.yaml
+++ b/components/filter_image_resolution/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Filter image resolution
 description: Component that filters images based on minimum size and max aspect ratio
-image: ghcr.io/ml6team/filter_image_resolution:41bd8f9d8f8003b41ffa375c1887869208e519ea
+image: ghcr.io/ml6team/filter_image_resolution:1aaf6068e7135b917e5e5cb16b3a2bf9d58f4a4e

 consumes:
   image:
diff --git a/components/filter_image_resolution/src/main.py b/components/filter_image_resolution/src/main.py
index c6e0276c0..3ab422ca8 100644
--- a/components/filter_image_resolution/src/main.py
+++ b/components/filter_image_resolution/src/main.py
@@ -24,6 +24,10 @@ def __init__(self, *_, min_image_dim: int, max_aspect_ratio:
float) -> None: self.max_aspect_ratio = max_aspect_ratio def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: + + print("Length of dataframe:", len(dataframe)) + print("First rows of dataframe:", dataframe.head()) + width = dataframe["image"]["width"] height = dataframe["image"]["height"] min_image_dim = np.minimum(width, height) diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 0017e720e..4f38c485f 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -48,6 +48,8 @@ filter_image_resolution_op = ComponentOp.from_registry( name="filter_image_resolution", arguments={"min_image_dim": 200, "max_aspect_ratio": 3}, + node_pool_name="n2-standard-128-pool", + output_partition_size='disable', ) filter_complexity_op = ComponentOp( component_dir="components/filter_text_complexity", diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 771af8d53..ea5318d52 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -93,6 +93,9 @@ def _load_subset(self, subset_name: str, fields: t.List[str]) -> dd.DataFrame: subset_df = dd.read_parquet(remote_path, columns=fields) + logger.info(f"First few rows of subset {subset_name}:") + print(subset_df.head()) + # add subset prefix to columns subset_df = subset_df.rename( columns={col: subset_name + "_" + col for col in subset_df.columns}, @@ -125,7 +128,12 @@ def load_dataframe(self) -> dd.DataFrame: as well as the index columns. """ # load index into dataframe + logging.info(f"Loading index...") dataframe = self._load_index() + + logger.info(f"First few rows of index dataframe:") + print(dataframe.head()) + for name, subset in self.component_spec.consumes.items(): fields = list(subset.fields.keys()) subset_df = self._load_subset(name, fields) @@ -140,6 +148,9 @@ def load_dataframe(self) -> dd.DataFrame: dataframe = self.partition_loaded_dataframe(dataframe) + logger.info(f"First few rows of final dataframe provided to the user:") + print(dataframe.head()) + logging.info(f"Columns of dataframe: {list(dataframe.columns)}") return dataframe From 887f48fcc777d281d1284f3a0887650b68a9187a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 27 Jul 2023 12:34:37 +0200 Subject: [PATCH 24/65] More improvements --- components/filter_image_resolution/Dockerfile | 2 +- components/filter_image_resolution/fondant_component.yaml | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 8 +++++++- .../components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile index 3817dfb39..f51015301 100644 --- a/components/filter_image_resolution/Dockerfile +++ b/components/filter_image_resolution/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=1aaf6068e7135b917e5e5cb16b3a2bf9d58f4a4e +ARG FONDANT_VERSION=f18dd510b1f106f421d433eb02b512177d5116a3 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml index 442dbe079..4f2607bc9 100644 --- 
a/components/filter_image_resolution/fondant_component.yaml +++ b/components/filter_image_resolution/fondant_component.yaml @@ -1,6 +1,6 @@ name: Filter image resolution description: Component that filters images based on minimum size and max aspect ratio -image: ghcr.io/ml6team/filter_image_resolution:1aaf6068e7135b917e5e5cb16b3a2bf9d58f4a4e +image: ghcr.io/ml6team/filter_image_resolution:f18dd510b1f106f421d433eb02b512177d5116a3 consumes: image: diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 7c7ab69eb..db6da0b01 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:42a948b606e0c84f1e042e52d207a820a5df48d2 +image: ghcr.io/ml6team/load_from_hf_hub:f18dd510b1f106f421d433eb02b512177d5116a3 produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index 8e3151ea1..686a282b4 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -61,7 +61,13 @@ def load(self) -> dd.DataFrame: partition_length = self.dataset_length // dask_df.npartitions npartitions = self.n_rows_to_load // partition_length dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions) - dask_df = dd.from_pandas(dask_df, npartitions=npartitions).reset_index(drop=True) + dask_df = dd.from_pandas(dask_df, npartitions=npartitions) + # .reset_index(drop=True) # will reset it from 0 for every partition + + # Set monotonically increasing index + dask_df["id"] = 1 + dask_df["id"] = dask_df.id.cumsum() + dask_df = dask_df.set_index("id", sort=True) return dask_df diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 6807bfeb7..9aa9bdb35 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:42a948b606e0c84f1e042e52d207a820a5df48d2 +image: ghcr.io/ml6team/load_from_hf_hub:f18dd510b1f106f421d433eb02b512177d5116a3 produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 4f38c485f..6b3645557 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -70,7 +70,7 @@ # add ops to pipeline pipeline.add_op(load_from_hub_op) -pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) +# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From ab176415315cb8a42119469835dad4d924a5c9d8 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 27 Jul 2023 13:13:34 +0200 Subject: [PATCH 25/65] More improvements --- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- 4 files changed, 4 
insertions(+), 4 deletions(-) diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index 7f4286878..c401acdb4 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=42a948b606e0c84f1e042e52d207a820a5df48d2 +ARG FONDANT_VERSION=fdd0bdac524845a9e0a359916c5a54a9b7518f1d RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index db6da0b01..a99d8ff09 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:f18dd510b1f106f421d433eb02b512177d5116a3 +image: ghcr.io/ml6team/load_from_hf_hub:fdd0bdac524845a9e0a359916c5a54a9b7518f1d produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 9aa9bdb35..20ca7e2ee 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:f18dd510b1f106f421d433eb02b512177d5116a3 +image: ghcr.io/ml6team/load_from_hf_hub:fdd0bdac524845a9e0a359916c5a54a9b7518f1d produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 6b3645557..4f38c485f 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -70,7 +70,7 @@ # add ops to pipeline pipeline.add_op(load_from_hub_op) -# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) +pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From 12ba25f625504e947ab6b625fbfe485077d620fb Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 27 Jul 2023 13:33:07 +0200 Subject: [PATCH 26/65] Comment out code --- examples/pipelines/datacomp/pipeline.py | 4 ++-- src/fondant/data_io.py | 17 +++++++---------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 4f38c485f..cbf28fa0d 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -39,11 +39,11 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 100000, + "n_rows_to_load": 500000, "dataset_length": 12800000, }, node_pool_name="n2-standard-128-pool", - output_partition_size="10MB", + # output_partition_size="10MB", ) filter_image_resolution_op = ComponentOp.from_registry( name="filter_image_resolution", diff --git a/src/fondant/data_io.py 
b/src/fondant/data_io.py index ea5318d52..818d7da53 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -198,20 +198,22 @@ def partition_written_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame: msg, ) + logger.info(f"Dataframe number of partitions is {dataframe.npartitions}") + return dataframe def write_dataframe(self, dataframe: dd.DataFrame) -> None: write_tasks = [] dataframe = self.partition_written_dataframe(dataframe) - - logger.info(f"Dataframe number of partitions is {dataframe.npartitions}") - - logger.info("Creating write tasks...") dataframe.index = dataframe.index.rename("id").astype("string") - dataframe.visualize(filename=f'{self.manifest.base_path}/graph.png') + # logging.info("Visualizing task graph...") + # TODO: doesn't work on GCP + # dataframe.visualize(filename=f'{self.manifest.base_path}/graph.png') + + logger.info("Creating write tasks...") # Turn index into an empty dataframe so we can write it index_df = dataframe.index.to_frame().drop(columns=["id"]) @@ -237,11 +239,6 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None: with ProgressBar(): logging.info("Writing data...") - # visualize the low level Dask graph - logging.info("Visualizing task graph...") - dask.visualize(*write_tasks, filename='task_graph.png') - for i, task in enumerate(write_tasks): - task.visualize(filename=f'task_{i}.svg') dd.compute(*write_tasks) @staticmethod From 6e1d318f9fed54a4ebccc3abe99a11604cc98018 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 27 Jul 2023 14:18:44 +0200 Subject: [PATCH 27/65] More improvements --- components/filter_image_resolution/Dockerfile | 2 +- components/filter_image_resolution/fondant_component.yaml | 2 +- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 1 + .../components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 7 files changed, 8 insertions(+), 7 deletions(-) diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile index f51015301..af0d837c4 100644 --- a/components/filter_image_resolution/Dockerfile +++ b/components/filter_image_resolution/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=f18dd510b1f106f421d433eb02b512177d5116a3 +ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml index 4f2607bc9..e441cac78 100644 --- a/components/filter_image_resolution/fondant_component.yaml +++ b/components/filter_image_resolution/fondant_component.yaml @@ -1,6 +1,6 @@ name: Filter image resolution description: Component that filters images based on minimum size and max aspect ratio -image: ghcr.io/ml6team/filter_image_resolution:f18dd510b1f106f421d433eb02b512177d5116a3 +image: ghcr.io/ml6team/filter_image_resolution:28ec87862c160ead773eb15b57905ac61515f8cf consumes: image: diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index c401acdb4..af0d837c4 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install 
--no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=fdd0bdac524845a9e0a359916c5a54a9b7518f1d +ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index a99d8ff09..afe3736fa 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:fdd0bdac524845a9e0a359916c5a54a9b7518f1d +image: ghcr.io/ml6team/load_from_hf_hub:28ec87862c160ead773eb15b57905ac61515f8cf produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index 686a282b4..2f4b2f9f7 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -65,6 +65,7 @@ def load(self) -> dd.DataFrame: # .reset_index(drop=True) # will reset it from 0 for every partition # Set monotonically increasing index + logger.info("Setting the index...") dask_df["id"] = 1 dask_df["id"] = dask_df.id.cumsum() dask_df = dask_df.set_index("id", sort=True) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 20ca7e2ee..8015adf5b 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:fdd0bdac524845a9e0a359916c5a54a9b7518f1d +image: ghcr.io/ml6team/load_from_hf_hub:28ec87862c160ead773eb15b57905ac61515f8cf produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index cbf28fa0d..73edc0e05 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From ef5c323a2768d57fdb892c1426aeef8a075094f9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 27 Jul 2023 14:21:02 +0200 Subject: [PATCH 28/65] Remove print statements --- components/filter_image_resolution/src/main.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/components/filter_image_resolution/src/main.py b/components/filter_image_resolution/src/main.py index 3ab422ca8..c6e0276c0 100644 --- a/components/filter_image_resolution/src/main.py +++ b/components/filter_image_resolution/src/main.py @@ -24,10 +24,6 @@ def __init__(self, *_, min_image_dim: int, max_aspect_ratio: float) -> None: self.max_aspect_ratio = max_aspect_ratio def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - - print("Length of dataframe:", len(dataframe)) - print("First rows of 
dataframe:", dataframe.head()) - width = dataframe["image"]["width"] height = dataframe["image"]["height"] min_image_dim = np.minimum(width, height)
From cfba11abbd808cd602dfc894c2be5b8fd63486ba Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Jul 2023 09:38:40 +0200 Subject: [PATCH 29/65] Fix repartitioning --- .../components/filter_text_complexity/Dockerfile | 2 +- .../filter_text_complexity/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- src/fondant/data_io.py | 9 +++------ 4 files changed, 6 insertions(+), 9 deletions(-)
diff --git a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile index 610851a01..b1c6fe14c 100644 --- a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile +++ b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile @@ -12,7 +12,7 @@ RUN python -m spacy download en_core_web_sm # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=79df895e9d62d2010ccb8d40ee7e4fd4c68f117d +ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml b/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml index 7513d5ebf..dfb43a930 100644 --- a/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml @@ -1,6 +1,6 @@ name: Filter text complexity description: Component that filters text based on their dependency parse complexity and number of actions -image: ghcr.io/ml6team/filter_text_complexity:latest +image: ghcr.io/ml6team/filter_text_complexity:28ec87862c160ead773eb15b57905ac61515f8cf consumes: text:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 73edc0e05..18f8414b1 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -71,7 +71,7 @@ # add ops to pipeline pipeline.add_op(load_from_hub_op) pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) -# pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) +pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 818d7da53..27c9c4036 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -131,13 +131,13 @@ def load_dataframe(self) -> dd.DataFrame: logging.info(f"Loading index...") dataframe = self._load_index() - logger.info(f"First few rows of index dataframe:") - print(dataframe.head()) - for name, subset in self.component_spec.consumes.items(): fields = list(subset.fields.keys()) subset_df = self._load_subset(name, fields) # left joins -> filter on index + # make sure that dataframe has same number of partitions + # as subset + dataframe = dataframe.repartition(npartitions=subset_df.npartitions) dataframe = dd.merge( dataframe, subset_df, @@ -148,9 +148,6 @@ def load_dataframe(self) -> dd.DataFrame: dataframe = self.partition_loaded_dataframe(dataframe) - logger.info(f"First few rows of final
dataframe provided to the user:") - print(dataframe.head()) - logging.info(f"Columns of dataframe: {list(dataframe.columns)}") return dataframe From 014543ba0cbc3128ab4e1cd230dfce6d15efd20d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Jul 2023 10:55:52 +0200 Subject: [PATCH 30/65] More improvements --- components/filter_image_resolution/Dockerfile | 2 +- components/filter_image_resolution/fondant_component.yaml | 2 +- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/filter_text_complexity/Dockerfile | 2 +- .../components/filter_text_complexity/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile index af0d837c4..177c9bf15 100644 --- a/components/filter_image_resolution/Dockerfile +++ b/components/filter_image_resolution/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf +ARG FONDANT_VERSION=f3f3925b8e8f634e2978e5c7fcefa72c53baba7c RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml index e441cac78..41226e2cb 100644 --- a/components/filter_image_resolution/fondant_component.yaml +++ b/components/filter_image_resolution/fondant_component.yaml @@ -1,6 +1,6 @@ name: Filter image resolution description: Component that filters images based on minimum size and max aspect ratio -image: ghcr.io/ml6team/filter_image_resolution:28ec87862c160ead773eb15b57905ac61515f8cf +image: ghcr.io/ml6team/filter_image_resolution:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c consumes: image: diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index af0d837c4..177c9bf15 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf +ARG FONDANT_VERSION=f3f3925b8e8f634e2978e5c7fcefa72c53baba7c RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index afe3736fa..01f8022f5 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:28ec87862c160ead773eb15b57905ac61515f8cf +image: ghcr.io/ml6team/load_from_hf_hub:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile index b1c6fe14c..c7ede3184 100644 --- a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile +++ 
b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile @@ -12,7 +12,7 @@ RUN python -m spacy download en_core_web_sm # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf +ARG FONDANT_VERSION=f3f3925b8e8f634e2978e5c7fcefa72c53baba7c RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml b/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml index dfb43a930..143032b82 100644 --- a/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml @@ -1,6 +1,6 @@ name: Filter text complexity description: Component that filters text based on their dependency parse complexity and number of actions -image: ghcr.io/ml6team/filter_text_complexity:28ec87862c160ead773eb15b57905ac61515f8cf +image: ghcr.io/ml6team/filter_text_complexity:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c consumes: text: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 18f8414b1..c6d771bc5 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From b63c5cbd0b4028e28355a55ad8f7d833a5e64fb9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Jul 2023 11:22:31 +0200 Subject: [PATCH 31/65] More improvements --- .../components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 8015adf5b..87e799c3f 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:28ec87862c160ead773eb15b57905ac61515f8cf +image: ghcr.io/ml6team/load_from_hf_hub:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index c6d771bc5..18f8414b1 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From ce67179cd3674bf606e58ed8e2fb20b50975a0d3 Mon Sep 
17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 10:45:44 +0200 Subject: [PATCH 32/65] Add download images component --- .../components/download_images/Dockerfile | 23 ++ .../components/download_images/README.md | 12 + .../download_images/fondant_component.yaml | 49 ++++ .../download_images/requirements.txt | 2 + .../components/download_images/src/main.py | 159 +++++++++++ .../components/download_images/src/resizer.py | 258 ++++++++++++++++++ examples/pipelines/datacomp/pipeline.py | 8 +- 7 files changed, 509 insertions(+), 2 deletions(-) create mode 100644 examples/pipelines/datacomp/components/download_images/Dockerfile create mode 100644 examples/pipelines/datacomp/components/download_images/README.md create mode 100644 examples/pipelines/datacomp/components/download_images/fondant_component.yaml create mode 100644 examples/pipelines/datacomp/components/download_images/requirements.txt create mode 100644 examples/pipelines/datacomp/components/download_images/src/main.py create mode 100644 examples/pipelines/datacomp/components/download_images/src/resizer.py
diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile new file mode 100644 index 000000000..abfa9a414 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/Dockerfile @@ -0,0 +1,23 @@ +FROM --platform=linux/amd64 python:3.8-slim + +# System dependencies +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install git -y + +# Install requirements +COPY requirements.txt / +RUN pip3 install --no-cache-dir -r requirements.txt + +# Install Fondant +# This is split from other requirements to leverage caching +ARG FONDANT_VERSION=main +RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} + +# Set the working directory to the component folder +WORKDIR /component/src + +# Copy over src-files +COPY src/ . + +ENTRYPOINT ["python", "main.py"] \ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/download_images/README.md b/examples/pipelines/datacomp/components/download_images/README.md new file mode 100644 index 000000000..f9fb8b866 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/README.md @@ -0,0 +1,12 @@ +# download_images + +### Description +This component takes in image URLs as input and downloads the images, along with some metadata (like their height and width). +The images are stored in a new column as bytes objects. This component also resizes the images using the [resizer](https://github.com/rom1504/img2dataset/blob/main/img2dataset/resizer.py) function from the img2dataset library. + +If the component is unable to retrieve the image at a URL (for any reason), it will return `None` for that particular URL. + +### **Inputs/Outputs** + +See [`fondant_component.yaml`](fondant_component.yaml) for a more detailed description of all the input/output parameters.
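+
+As a small illustration of the `None` behaviour described above (hypothetical values and column names, not part of the component's code): rows whose download failed are simply removed by the component's `dropna()` step.
+
+```python
+import pandas as pd
+
+# Hypothetical download results: the second URL could not be retrieved,
+# so all of its image fields are None.
+df = pd.DataFrame({
+    "image_url": ["https://example.com/a.jpg", "https://example.com/b.jpg"],
+    "image_data": [b"\x89PNG...", None],
+    "image_width": [256, None],
+    "image_height": [256, None],
+})
+
+# Mirrors the component's dropna() call: the failed row disappears.
+print(len(df.dropna()))  # 1
+```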
+ diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml new file mode 100644 index 000000000..f1e089777 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -0,0 +1,49 @@ +name: Download images +description: Component that downloads images based on URLs +image: ghcr.io/ml6team/download_images:dev + +consumes: + image: + fields: + url: + type: string + +produces: + image: + fields: + data: + type: binary + width: + type: int16 + height: + type: int16 + +args: + timeout: + description: Maximum time (in seconds) to wait when trying to download an image + type: int + default: 10 + retries: + description: Number of times to retry downloading an image if it fails. + type: int + default: 0 + image_size: + description: Size of the images after resizing. + type: int + default: 256 + resize_mode: + description: Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". + type: str + default: 'border' + resize_only_if_bigger: + description: If True, resize only if image is bigger than image_size. + type: bool + default: 'False' + min_image_size: + description: Minimum size of the images. + type: int + default: 0 + max_aspect_ratio: + description: Maximum aspect ratio of the images. + type: float + default: 'inf' \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/download_images/requirements.txt b/examples/pipelines/datacomp/components/download_images/requirements.txt new file mode 100644 index 000000000..de9d1dfe3 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/requirements.txt @@ -0,0 +1,2 @@ +albumentations==1.3.0 +opencv-python-headless>=4.5.5.62,<5 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py new file mode 100644 index 000000000..64d04e967 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -0,0 +1,159 @@ +""" +This component downloads images based on URLs, and resizes them based on various settings like +minimum image size and aspect ratio. + +Some functions here are directly taken from +https://github.com/rom1504/img2dataset/blob/main/img2dataset/downloader.py. 
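+Images that cannot be fetched or decoded are represented as None values rather than raising, so a single bad URL does not abort the job.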
+""" +import io +import logging +import traceback +import urllib + +import dask.dataframe as dd +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor +from resizer import Resizer + +logger = logging.getLogger(__name__) + + +def is_disallowed(headers, user_agent_token, disallowed_header_directives): + """Check if HTTP headers contain an X-Robots-Tag directive disallowing usage.""" + for values in headers.get_all("X-Robots-Tag", []): + try: + uatoken_directives = values.split(":", 1) + directives = [x.strip().lower() for x in uatoken_directives[-1].split(",")] + ua_token = ( + uatoken_directives[0].lower() if len(uatoken_directives) == 2 # noqa: PLR2004 + else None + ) + if (ua_token is None or ua_token == user_agent_token) and any( + x in disallowed_header_directives for x in directives + ): + return True + except Exception as err: + traceback.print_exc() + print(f"Failed to parse X-Robots-Tag: {values}: {err}") + return False + + +def download_image(url, timeout, user_agent_token, disallowed_header_directives): + """Download an image with urllib.""" + img_stream = None + user_agent_string = ( + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0" + ) + if user_agent_token: + user_agent_string += f" (compatible; {user_agent_token}; " \ + f"+https://github.com/rom1504/img2dataset)" + try: + request = urllib.request.Request( + url, data=None, headers={"User-Agent": user_agent_string}, + ) + with urllib.request.urlopen(request, timeout=timeout) as r: + if disallowed_header_directives and is_disallowed( + r.headers, + user_agent_token, + disallowed_header_directives, + ): + return None + img_stream = io.BytesIO(r.read()) + return img_stream + except Exception: + if img_stream is not None: + img_stream.close() + return None + + +def download_image_with_retry( + url, + *, + timeout, + retries, + resizer, + user_agent_token=None, + disallowed_header_directives=None, +): + for _ in range(retries + 1): + img_stream = download_image( + url, timeout, user_agent_token, disallowed_header_directives, + ) + if img_stream is not None: + # resize the image + img_str, width, height = resizer(img_stream) + return img_str, width, height + return None, None, None + + +class DownloadImagesComponent(DaskTransformComponent): + """Component that downloads images based on URLs.""" + + def __init__(self, + *_, + timeout: int, + retries: int, + image_size: int, + resize_mode: str, + resize_only_if_bigger: bool, + min_image_size: int, + max_aspect_ratio: float, + ): + """Component that downloads images from a list of URLs and executes filtering and resizing. + + Args: + timeout: Maximum time (in seconds) to wait when trying to download an image. + retries: Number of times to retry downloading an image if it fails. + image_size: Size of the images after resizing. + resize_mode: Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". + resize_only_if_bigger: If True, resize only if image is bigger than image_size. + min_image_size: Minimum size of the images. + max_aspect_ratio: Maximum aspect ratio of the images. 
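+                Images outside the size or aspect-ratio bounds are returned as (None, None, None) by the resizer and dropped afterwards.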
+ + Returns: + Dask dataframe + """ + self.timeout = timeout + self.retries = retries + self.resizer = Resizer( + image_size=image_size, + resize_mode=resize_mode, + resize_only_if_bigger=resize_only_if_bigger, + min_image_size=min_image_size, + max_aspect_ratio=max_aspect_ratio, + ) + + def transform( + self, + dataframe: dd.DataFrame, + ) -> dd.DataFrame: + logger.info("Instantiating resizer...") + + # Remove duplicates from laion retrieval + dataframe = dataframe.drop_duplicates() + + result = dataframe.apply( + lambda example: download_image_with_retry( + url=example.images_url, + timeout=self.timeout, + retries=self.retries, + resizer=self.resizer, + ), + axis=1, + result_type="expand", + meta={0: bytes, 1: int, 2: int}, + ) + + result.columns = [("image", "data"), ("image", "width"), ("image", "height")] + + dataframe = dataframe.merge(result, left_index=True, right_index=True) + + # Remove images that could not be fetched + dataframe = dataframe.dropna() + + return dataframe + + +if __name__ == "__main__": + executor = DaskTransformExecutor.from_args() + executor.execute(DownloadImagesComponent) diff --git a/examples/pipelines/datacomp/components/download_images/src/resizer.py b/examples/pipelines/datacomp/components/download_images/src/resizer.py new file mode 100644 index 000000000..f545a0bf1 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/src/resizer.py @@ -0,0 +1,258 @@ +"""resizer module handle image resizing. + +Source: https://github.com/rom1504/img2dataset/blob/main/img2dataset/resizer.py. + +MIT License + +Copyright (c) 2021 Romain Beaumont +""" +# ruff: noqa + +import albumentations as A +import cv2 +import numpy as np +from enum import Enum +import imghdr +import os + +_INTER_STR_TO_CV2 = dict( + nearest=cv2.INTER_NEAREST, + linear=cv2.INTER_LINEAR, + bilinear=cv2.INTER_LINEAR, + cubic=cv2.INTER_CUBIC, + bicubic=cv2.INTER_CUBIC, + area=cv2.INTER_AREA, + lanczos=cv2.INTER_LANCZOS4, + lanczos4=cv2.INTER_LANCZOS4, +) + + +class ResizeMode(Enum): + no = 0 # pylint: disable=invalid-name + keep_ratio = 1 # pylint: disable=invalid-name + center_crop = 2 # pylint: disable=invalid-name + border = 3 # pylint: disable=invalid-name + keep_ratio_largest = 4 # pylint: disable=invalid-name + + +# thanks https://stackoverflow.com/questions/11130156/suppress-stdout-stderr-print-from-python-functions +class SuppressStdoutStderr: + """ + A context manager for doing a "deep suppression" of stdout and stderr in + Python, i.e. will suppress all print, even if the print originates in a + compiled C/Fortran sub-function. + This will not suppress raised exceptions, since exceptions are printed + to stderr just before a script exits, and after the context manager has + exited (at least, I think that is why it lets exceptions through). + + """ + + def __init__(self): + # Open a pair of null files + self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] + # Save the actual stdout (1) and stderr (2) file descriptors. + self.save_fds = [os.dup(1), os.dup(2)] + + def __enter__(self): + # Assign the null pointers to stdout and stderr. 
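+        # os.dup2 closes fds 1 and 2 and points them at /dev/null, so even
+        # C-level writes to stdout/stderr are swallowed until __exit__ restores
+        # the descriptors saved above.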
+ os.dup2(self.null_fds[0], 1) + os.dup2(self.null_fds[1], 2) + + def __exit__(self, *_): + # Re-assign the real stdout/stderr back to (1) and (2) + os.dup2(self.save_fds[0], 1) + os.dup2(self.save_fds[1], 2) + # Close all file descriptors + for fd in self.null_fds + self.save_fds: + os.close(fd) + + +def inter_str_to_cv2(inter_str): + inter_str = inter_str.lower() + if inter_str not in _INTER_STR_TO_CV2: + raise Exception(f"Invalid option for interpolation: {inter_str}") + return _INTER_STR_TO_CV2[inter_str] + + +class Resizer: + """ + Resize images + Expose a __call__ method to be used as a callable object + + Should be used to resize one image at a time + + Options: + resize_mode: "no", "keep_ratio", "center_crop", "border" + resize_only_if_bigger: if True, resize only if image is bigger than image_size + image_size: size of the output image to resize + """ + + def __init__( + self, + image_size, + resize_mode, + resize_only_if_bigger, + upscale_interpolation="lanczos", + downscale_interpolation="area", + encode_quality=95, + encode_format="jpg", + skip_reencode=False, + disable_all_reencoding=False, + min_image_size=0, + max_image_area=float("inf"), + max_aspect_ratio=float("inf"), + blurrer=None, + ): + if encode_format not in ["jpg", "png", "webp"]: + raise ValueError(f"Invalid encode format {encode_format}") + if encode_format == "png": + if encode_quality < 0 or encode_quality > 9: + raise ValueError( + "For png, encode quality represents compression which" + f"must be between 0 and 9, got {encode_quality}" + ) + + self.image_size = image_size + if isinstance(resize_mode, str): + if ( + resize_mode not in ResizeMode.__members__ + ): # pylint: disable=unsupported-membership-test + raise Exception(f"Invalid option for resize_mode: {resize_mode}") + resize_mode = ResizeMode[resize_mode] + self.resize_mode = resize_mode + self.resize_only_if_bigger = resize_only_if_bigger + self.upscale_interpolation = inter_str_to_cv2(upscale_interpolation) + self.downscale_interpolation = inter_str_to_cv2(downscale_interpolation) + self.encode_format = encode_format + cv2_img_quality = None + if encode_format == "jpg": + cv2_img_quality = int(cv2.IMWRITE_JPEG_QUALITY) + self.what_ext = "jpeg" + elif encode_format == "png": + cv2_img_quality = int(cv2.IMWRITE_PNG_COMPRESSION) + self.what_ext = "png" + elif encode_format == "webp": + cv2_img_quality = int(cv2.IMWRITE_WEBP_QUALITY) + self.what_ext = "webp" + if cv2_img_quality is None: + raise Exception(f"Invalid option for encode_format: {encode_format}") + self.encode_params = [cv2_img_quality, encode_quality] + self.skip_reencode = skip_reencode + self.disable_all_reencoding = disable_all_reencoding + self.min_image_size = min_image_size + self.max_image_area = max_image_area + self.max_aspect_ratio = max_aspect_ratio + self.blurrer = blurrer + + def __call__(self, img_stream, blurring_bbox_list=None): + """ + input: an image stream, optionally a list of bounding boxes to blur. 
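+        (boxes are only blurred when a blurrer was configured at init; passing boxes without one causes the image to be dropped)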
+        output: img_str, width, height + """ + try: + if self.disable_all_reencoding: + return img_stream.read(), None, None + with SuppressStdoutStderr(): + cv2.setNumThreads(1) + img_stream.seek(0) + encode_needed = ( + imghdr.what(img_stream) != self.what_ext + if self.skip_reencode + else True + ) + img_stream.seek(0) + img_buf = np.frombuffer(img_stream.read(), np.uint8) + img = cv2.imdecode(img_buf, cv2.IMREAD_UNCHANGED) + if img is None: + raise Exception("Image decoding error") + if len(img.shape) == 3 and img.shape[-1] == 4: + # alpha matting with white background + alpha = img[:, :, 3, np.newaxis] + img = alpha / 255 * img[..., :3] + 255 - alpha + img = np.rint(img.clip(min=0, max=255)).astype(np.uint8) + encode_needed = True + original_height, original_width = img.shape[:2] + # check if image is too small + if min(original_height, original_width) < self.min_image_size: + return None, None, None + if original_height * original_width > self.max_image_area: + return None, None, None + # check if wrong aspect ratio + if ( + max(original_height, original_width) + / min(original_height, original_width) + > self.max_aspect_ratio + ): + return None, None, None + + # check if resizer was defined during init if needed + if blurring_bbox_list is not None and self.blurrer is None: + return None, None, None + + # Flag to check if blurring is still needed. + maybe_blur_still_needed = True + + # resizing in following conditions + if self.resize_mode in (ResizeMode.keep_ratio, ResizeMode.center_crop): + downscale = min(original_width, original_height) > self.image_size + if not self.resize_only_if_bigger or downscale: + interpolation = ( + self.downscale_interpolation + if downscale + else self.upscale_interpolation + ) + img = A.smallest_max_size( + img, self.image_size, interpolation=interpolation + ) + if blurring_bbox_list is not None and self.blurrer is not None: + img = self.blurrer(img=img, bbox_list=blurring_bbox_list) + if self.resize_mode == ResizeMode.center_crop: + img = A.center_crop(img, self.image_size, self.image_size) + encode_needed = True + maybe_blur_still_needed = False + elif self.resize_mode in ( + ResizeMode.border, + ResizeMode.keep_ratio_largest, + ): + downscale = max(original_width, original_height) > self.image_size + if not self.resize_only_if_bigger or downscale: + interpolation = ( + self.downscale_interpolation + if downscale + else self.upscale_interpolation + ) + img = A.longest_max_size( + img, self.image_size, interpolation=interpolation + ) + if blurring_bbox_list is not None and self.blurrer is not None: + img = self.blurrer(img=img, bbox_list=blurring_bbox_list) + if self.resize_mode == ResizeMode.border: + img = A.pad( + img, + self.image_size, + self.image_size, + border_mode=cv2.BORDER_CONSTANT, + value=[255, 255, 255], + ) + encode_needed = True + maybe_blur_still_needed = False + + # blur parts of the image if needed + if ( + maybe_blur_still_needed + and blurring_bbox_list is not None + and self.blurrer is not None + ): + img = self.blurrer(img=img, bbox_list=blurring_bbox_list) + + height, width = img.shape[:2] + if encode_needed: + img_str = cv2.imencode( + f".{self.encode_format}", img, params=self.encode_params + )[1].tobytes() + else: + img_str = img_buf.tobytes() + return img_str, width, height + + except Exception as err: # pylint: disable=broad-except + return None, None, None
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 18f8414b1..10022ddc8 100644 ---
a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -67,11 +67,15 @@ "num_clusters": 3, }, ) +download_images_op = ComponentOp( + component_dir="components/download_images", +) # add ops to pipeline pipeline.add_op(load_from_hub_op) -pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) -pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) +# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) +# pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) +pipeline.add_op(download_images_op, dependencies=load_from_hub_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From 3d9f1197bd8cbe07c1adf61189e2cc78ad9deb66 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 11:49:26 +0200 Subject: [PATCH 33/65] Update script --- .../load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 5 +++-- .../components/download_images/Dockerfile | 2 +- .../download_images/fondant_component.yaml | 2 +- .../components/download_images/src/main.py | 21 ++++++------------- .../load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- 7 files changed, 14 insertions(+), 22 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 01f8022f5..05b65a56e 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c +image: ghcr.io/ml6team/load_from_hf_hub:3c9ea91ff9221286f9a228c61c3cea44e5499a12 produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index 2f4b2f9f7..a2103dbbf 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -57,9 +57,10 @@ def load(self) -> dd.DataFrame: raise ValueError("""Make sure to also specify the length of the entire dataset. This is required as otherwise only the first partition can be loaded""") - logger.info(f"Loading approximately {self.n_rows_to_load} rows...") + logger.info(f"""Loading approximately {self.n_rows_to_load} rows... 
+ at least one partition""") partition_length = self.dataset_length // dask_df.npartitions - npartitions = self.n_rows_to_load // partition_length + npartitions = max(self.n_rows_to_load // partition_length, 1) dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions) dask_df = dd.from_pandas(dask_df, npartitions=npartitions) # .reset_index(drop=True) # will reset it from 0 for every partition diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile index abfa9a414..dcdb7bb29 100644 --- a/examples/pipelines/datacomp/components/download_images/Dockerfile +++ b/examples/pipelines/datacomp/components/download_images/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=main +ARG FONDANT_VERSION=3c9ea91ff9221286f9a228c61c3cea44e5499a12 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index f1e089777..9f800040b 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:dev +image: ghcr.io/ml6team/download_images:3c9ea91ff9221286f9a228c61c3cea44e5499a12 consumes: image: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 64d04e967..c6b89e392 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -11,8 +11,8 @@ import urllib import dask.dataframe as dd -from fondant.component import DaskTransformComponent -from fondant.executor import DaskTransformExecutor +from fondant.component import PandasTransformComponent +from fondant.executor import PandasTransformExecutor from resizer import Resizer logger = logging.getLogger(__name__) @@ -86,7 +86,7 @@ def download_image_with_retry( return None, None, None -class DownloadImagesComponent(DaskTransformComponent): +class DownloadImagesComponent(PandasTransformComponent): """Component that downloads images based on URLs.""" def __init__(self, @@ -129,25 +129,16 @@ def transform( ) -> dd.DataFrame: logger.info("Instantiating resizer...") - # Remove duplicates from laion retrieval - dataframe = dataframe.drop_duplicates() - - result = dataframe.apply( + dataframe[[("image", "data"), ("image", "width"), ("image", "height")]] = dataframe.apply( lambda example: download_image_with_retry( - url=example.images_url, + url=example.image.url, timeout=self.timeout, retries=self.retries, resizer=self.resizer, ), axis=1, - result_type="expand", - meta={0: bytes, 1: int, 2: int}, ) - result.columns = [("image", "data"), ("image", "width"), ("image", "height")] - - dataframe = dataframe.merge(result, left_index=True, right_index=True) - # Remove images that could not be fetched dataframe = dataframe.dropna() @@ -155,5 +146,5 @@ def transform( if __name__ == "__main__": - executor = DaskTransformExecutor.from_args() + executor = 
PandasTransformExecutor.from_args() executor.execute(DownloadImagesComponent) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 87e799c3f..072ae0996 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c +image: ghcr.io/ml6team/load_from_hf_hub:3c9ea91ff9221286f9a228c61c3cea44e5499a12 produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 10022ddc8..b43228df9 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -39,7 +39,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 500000, + "n_rows_to_load": 10000, "dataset_length": 12800000, }, node_pool_name="n2-standard-128-pool", From e288749a9f126ee51574b86cd0b3f2b1677f7cd7 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 12:06:33 +0200 Subject: [PATCH 34/65] Remove graphviz --- components/filter_image_resolution/Dockerfile | 3 +-- components/load_from_hf_hub/Dockerfile | 3 +-- .../components/download_images/fondant_component.yaml | 2 +- pyproject.toml | 1 - src/fondant/data_io.py | 4 ---- 5 files changed, 3 insertions(+), 10 deletions(-) diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile index 177c9bf15..5db8abca7 100644 --- a/components/filter_image_resolution/Dockerfile +++ b/components/filter_image_resolution/Dockerfile @@ -3,8 +3,7 @@ FROM --platform=linux/amd64 python:3.8-slim # System dependencies RUN apt-get update && \ apt-get upgrade -y && \ - apt-get install git -y && \ - apt-get install graphviz -y + apt-get install git -y # Install requirements COPY requirements.txt / diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index 177c9bf15..5db8abca7 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -3,8 +3,7 @@ FROM --platform=linux/amd64 python:3.8-slim # System dependencies RUN apt-get update && \ apt-get upgrade -y && \ - apt-get install git -y && \ - apt-get install graphviz -y + apt-get install git -y # Install requirements COPY requirements.txt / diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 9f800040b..760d5b829 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:3c9ea91ff9221286f9a228c61c3cea44e5499a12 +image: ghcr.io/ml6team/download_images:06b316c830abe72d9d9a71f914c9c5fd205ec88b consumes: image: diff --git a/pyproject.toml b/pyproject.toml index b074eb7a2..7b837de9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,6 @@ dask = {extras = ["dataframe"], version = ">= 2023.4.1"} importlib-resources = { version = ">= 1.3", python = "<3.9" 
} jsonschema = ">= 4.18" pyarrow = ">= 11.0.0" -graphviz = ">= 0.20.1" fsspec = { version = ">= 2023.4.0", optional = true} gcsfs = { version = ">= 2023.4.0", optional = true } diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 27c9c4036..1f109a96c 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -206,10 +206,6 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None: dataframe.index = dataframe.index.rename("id").astype("string") - # logging.info("Visualizing task graph...") - # TODO: doesn't work on GCP - # dataframe.visualize(filename=f'{self.manifest.base_path}/graph.png') - logger.info("Creating write tasks...") # Turn index into an empty dataframe so we can write it From 6e6bd6af759220fb27557d737d2bfef04b52953f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 15:29:51 +0200 Subject: [PATCH 35/65] More improvements --- .../load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 2 + .../components/download_images/Dockerfile | 2 +- .../download_images/fondant_component.yaml | 3 +- .../components/download_images/src/main.py | 44 ++++++++++++------- .../load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 1 + 7 files changed, 37 insertions(+), 19 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 05b65a56e..7a76c1f05 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:3c9ea91ff9221286f9a228c61c3cea44e5499a12 +image: ghcr.io/ml6team/load_from_hf_hub:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index a2103dbbf..1427c0954 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -71,6 +71,8 @@ def load(self) -> dd.DataFrame: dask_df["id"] = dask_df.id.cumsum() dask_df = dask_df.set_index("id", sort=True) + print("Length of the dataframe:", len(dask_df)) + return dask_df diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile index dcdb7bb29..787f89bb4 100644 --- a/examples/pipelines/datacomp/components/download_images/Dockerfile +++ b/examples/pipelines/datacomp/components/download_images/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=3c9ea91ff9221286f9a228c61c3cea44e5499a12 +ARG FONDANT_VERSION=7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 760d5b829..069e6888a 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: 
ghcr.io/ml6team/download_images:06b316c830abe72d9d9a71f914c9c5fd205ec88b +image: ghcr.io/ml6team/download_images:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 consumes: image: @@ -17,6 +17,7 @@ produces: type: int16 height: type: int16 + additionalFields: false args: timeout: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index c6b89e392..bdc181fa9 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -11,8 +11,9 @@ import urllib import dask.dataframe as dd -from fondant.component import PandasTransformComponent -from fondant.executor import PandasTransformExecutor + +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor from resizer import Resizer logger = logging.getLogger(__name__) @@ -79,6 +80,7 @@ def download_image_with_retry( img_stream = download_image( url, timeout, user_agent_token, disallowed_header_directives, ) + print("Img stream:", img_stream) if img_stream is not None: # resize the image img_str, width, height = resizer(img_stream) @@ -86,7 +88,7 @@ def download_image_with_retry( return None, None, None -class DownloadImagesComponent(PandasTransformComponent): +class DownloadImagesComponent(DaskTransformComponent): """Component that downloads images based on URLs.""" def __init__(self, @@ -123,28 +125,40 @@ def __init__(self, max_aspect_ratio=max_aspect_ratio, ) - def transform( - self, - dataframe: dd.DataFrame, - ) -> dd.DataFrame: - logger.info("Instantiating resizer...") + def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + logger.info("Downloading images...") + + print("Timeout:", self.timeout) + print("Retries:", self.retries) - dataframe[[("image", "data"), ("image", "width"), ("image", "height")]] = dataframe.apply( + print("Columns of dataframe:", dataframe.columns) + print("First rows of dataframe:", dataframe.head(5)) + + result = dataframe.apply( lambda example: download_image_with_retry( - url=example.image.url, + url=example.image_url, timeout=self.timeout, retries=self.retries, resizer=self.resizer, ), axis=1, + result_type="expand", + meta={0: bytes, 1: int, 2: int}, ) - # Remove images that could not be fetched - dataframe = dataframe.dropna() + result.columns = [ + "images_data", + "images_width", + "images_height", + ] + + print("Length of the result:", len(result)) + print("Columns of result:", result.columns) + print("First rows of result:", result.head()) - return dataframe + return result if __name__ == "__main__": - executor = PandasTransformExecutor.from_args() - executor.execute(DownloadImagesComponent) + executor = DaskTransformExecutor.from_args() + executor.execute(DownloadImagesComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 072ae0996..d7052ac07 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:3c9ea91ff9221286f9a228c61c3cea44e5499a12 +image: ghcr.io/ml6team/load_from_hf_hub:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 produces: image: diff --git 
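For reference, the row-wise "expand" pattern this patch switches to can be exercised in isolation. The snippet below is a toy sketch with made-up column names and a stand-in fetch function, not the component's actual code; it shows how result_type="expand" plus a per-column meta turns a tuple-returning row function into three new columns.

    import dask.dataframe as dd
    import pandas as pd

    def fake_fetch(url):
        # Stand-in for download_image_with_retry: returns (data, width, height).
        return b"\x00", 256, 256

    df = dd.from_pandas(pd.DataFrame({"image_url": ["http://a", "http://b"]}), npartitions=1)

    result = df.apply(
        lambda row: fake_fetch(row.image_url),
        axis=1,
        result_type="expand",
        meta={0: bytes, 1: int, 2: int},  # one dtype per output column
    )
    result.columns = ["images_data", "images_width", "images_height"]
    print(result.compute())
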
a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index b43228df9..9e5f4b6ab 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -69,6 +69,7 @@ ) download_images_op = ComponentOp( component_dir="components/download_images", + node_pool_name="n2-standard-128-pool", ) # add ops to pipeline From 3a97346724235e607ffdc2e4981259c95132ccfe Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 16:04:52 +0200 Subject: [PATCH 36/65] Debug --- components/load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 3 +++ .../components/download_images/fondant_component.yaml | 2 +- .../datacomp/components/download_images/src/main.py | 9 +++------ .../components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 7a76c1f05..9b658a2e9 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 +image: ghcr.io/ml6team/load_from_hf_hub:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index 1427c0954..4ca9f6749 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -71,6 +71,9 @@ def load(self) -> dd.DataFrame: dask_df["id"] = dask_df.id.cumsum() dask_df = dask_df.set_index("id", sort=True) + # let's just load 10 rows for debugging + dask_df = dask_df.head(10) + dask_df = dd.from_pandas(dask_df, npartitions=1) print("Length of the dataframe:", len(dask_df)) return dask_df diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 069e6888a..a32a6dddf 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 +image: ghcr.io/ml6team/download_images:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 consumes: image: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index bdc181fa9..45b9fd615 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -130,19 +130,16 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: print("Timeout:", self.timeout) print("Retries:", self.retries) - - print("Columns of dataframe:", dataframe.columns) - print("First rows of dataframe:", dataframe.head(5)) - result = dataframe.apply( + result = dataframe.map_partitions( lambda example: download_image_with_retry( url=example.image_url, timeout=self.timeout, retries=self.retries, resizer=self.resizer, ), - axis=1, - result_type="expand", + # axis=1, + # result_type="expand", meta={0: bytes, 
1: int, 2: int}, ) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index d7052ac07..13bb38d5f 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 +image: ghcr.io/ml6team/load_from_hf_hub:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 9e5f4b6ab..15475eef1 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From d1882ec19d82657c2127ccbfbb14685266cc11b1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 16:15:55 +0200 Subject: [PATCH 37/65] Use Pandas component --- .../load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 5 --- .../components/download_images/src/main.py | 36 +++++++++---------- .../load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- 5 files changed, 21 insertions(+), 26 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 9b658a2e9..94fa473ea 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 +image: ghcr.io/ml6team/load_from_hf_hub:184080f269fe2f17e01d418fda221733003ac2b5 produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index 4ca9f6749..a2103dbbf 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -71,11 +71,6 @@ def load(self) -> dd.DataFrame: dask_df["id"] = dask_df.id.cumsum() dask_df = dask_df.set_index("id", sort=True) - # let's just load 10 rows for debugging - dask_df = dask_df.head(10) - dask_df = dd.from_pandas(dask_df, npartitions=1) - print("Length of the dataframe:", len(dask_df)) - return dask_df diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 45b9fd615..249c1f8fb 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -10,10 +10,10 @@ import traceback import urllib -import dask.dataframe as dd +import pandas as pd -from fondant.component import DaskTransformComponent -from fondant.executor import DaskTransformExecutor +from fondant.component import PandasTransformComponent +from fondant.executor import 
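A note on the map_partitions experiment above: Dask calls the mapped function once per partition with a plain pandas DataFrame, not once per row, so a row-style lambda (url=example.image_url) receives the wrong object. A minimal sketch of the partition-level contract, with toy data:

    import dask.dataframe as dd
    import pandas as pd

    df = dd.from_pandas(pd.DataFrame({"image_url": ["http://a", "http://b"]}), npartitions=2)

    def process_partition(part: pd.DataFrame) -> pd.DataFrame:
        # `part` is a whole pandas DataFrame; row-wise work happens inside it.
        part = part.copy()
        part["url_length"] = part["image_url"].str.len()
        return part

    out = df.map_partitions(process_partition, meta={"image_url": str, "url_length": int})
    print(out.compute())
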
PandasTransformExecutor from resizer import Resizer logger = logging.getLogger(__name__) @@ -88,7 +88,7 @@ def download_image_with_retry( return None, None, None -class DownloadImagesComponent(DaskTransformComponent): +class DownloadImagesComponent(PandasTransformComponent): """Component that downloads images based on URLs.""" def __init__(self, @@ -125,37 +125,37 @@ def __init__(self, max_aspect_ratio=max_aspect_ratio, ) - def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: logger.info("Downloading images...") print("Timeout:", self.timeout) print("Retries:", self.retries) - result = dataframe.map_partitions( + dataframe[[("image", "data"), ("image", "width"), ("images", "height")]] = dataframe["image"]["url"].apply( lambda example: download_image_with_retry( - url=example.image_url, + url=example, timeout=self.timeout, retries=self.retries, resizer=self.resizer, ), # axis=1, # result_type="expand", - meta={0: bytes, 1: int, 2: int}, + # meta={0: bytes, 1: int, 2: int}, ) - result.columns = [ - "images_data", - "images_width", - "images_height", - ] + # result.columns = [ + # "images_data", + # "images_width", + # "images_height", + # ] - print("Length of the result:", len(result)) - print("Columns of result:", result.columns) - print("First rows of result:", result.head()) + # print("Length of the result:", len(result)) + # print("Columns of result:", result.columns) + # print("First rows of result:", result.head()) - return result + return dataframe if __name__ == "__main__": - executor = DaskTransformExecutor.from_args() + executor = PandasTransformExecutor.from_args() executor.execute(DownloadImagesComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 13bb38d5f..ca83d6efd 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 +image: ghcr.io/ml6team/load_from_hf_hub:184080f269fe2f17e01d418fda221733003ac2b5 produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 15475eef1..6f09d7f18 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -39,7 +39,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 10000, + "n_rows_to_load": 10, "dataset_length": 12800000, }, node_pool_name="n2-standard-128-pool", From f81f12256ce41ee69e7af37ac3673a26f4a1f8b9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 17:26:59 +0200 Subject: [PATCH 38/65] Run on 1000 images --- .../load_from_hf_hub/fondant_component.yaml | 2 +- .../download_images/fondant_component.yaml | 6 +-- .../components/download_images/src/main.py | 43 +++++++------------ .../load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 3 +- 5 files changed, 23 insertions(+), 33 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 94fa473ea..072794d95 100644 --- 
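The Pandas variant above assigns the raw output of Series.apply, a Series of tuples, to three target columns at once, which pandas cannot broadcast (and one target is spelled ("images", "height") rather than ("image", "height")). A hedged sketch of the usual expansion idiom, on a toy flat-column frame instead of the component's ("image", "url") subcolumns:

    import pandas as pd

    df = pd.DataFrame({"url": ["http://a", "http://b"]})

    def fake_fetch(url):
        # Illustrative stand-in: returns (data, width, height).
        return b"\x00", 256, 256

    expanded = df["url"].apply(fake_fetch).apply(pd.Series)  # Series of tuples -> DataFrame
    expanded.columns = ["data", "width", "height"]
    df[["data", "width", "height"]] = expanded
    print(df)
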
a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:184080f269fe2f17e01d418fda221733003ac2b5 +image: ghcr.io/ml6team/load_from_hf_hub:de73f003806455e248125d3b1ce19c1a52aea8ea produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index a32a6dddf..9844b2200 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 +image: ghcr.io/ml6team/download_images:de73f003806455e248125d3b1ce19c1a52aea8ea consumes: image: @@ -14,9 +14,9 @@ produces: data: type: binary width: - type: int16 + type: int64 height: - type: int16 + type: int64 additionalFields: false args: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 249c1f8fb..0950e82e3 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -10,10 +10,10 @@ import traceback import urllib -import pandas as pd +import dask.dataframe as dd -from fondant.component import PandasTransformComponent -from fondant.executor import PandasTransformExecutor +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor from resizer import Resizer logger = logging.getLogger(__name__) @@ -88,7 +88,7 @@ def download_image_with_retry( return None, None, None -class DownloadImagesComponent(PandasTransformComponent): +class DownloadImagesComponent(DaskTransformComponent): """Component that downloads images based on URLs.""" def __init__(self, @@ -125,37 +125,26 @@ def __init__(self, max_aspect_ratio=max_aspect_ratio, ) - def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: + def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: logger.info("Downloading images...") - - print("Timeout:", self.timeout) - print("Retries:", self.retries) - dataframe[[("image", "data"), ("image", "width"), ("images", "height")]] = dataframe["image"]["url"].apply( + result = dataframe.apply( lambda example: download_image_with_retry( - url=example, - timeout=self.timeout, - retries=self.retries, - resizer=self.resizer, + url=example.image_url, timeout=self.timeout, retries=self.retries, resizer=self.resizer, ), - # axis=1, - # result_type="expand", - # meta={0: bytes, 1: int, 2: int}, + axis=1, + result_type='expand', + meta={0: object, 1: int, 2: int}, ) + result.columns = ['image_data', 'image_width', 'image_height'] - # result.columns = [ - # "images_data", - # "images_width", - # "images_height", - # ] - - # print("Length of the result:", len(result)) - # print("Columns of result:", result.columns) - # print("First rows of result:", result.head()) + print("Length of the final dataframe:", len(dataframe)) + print("First few rows of final dataframe:") + print(result.head(5)) - return dataframe + return result if __name__ == "__main__": - executor = PandasTransformExecutor.from_args() + executor 
= DaskTransformExecutor.from_args() executor.execute(DownloadImagesComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index ca83d6efd..ecce89a91 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:184080f269fe2f17e01d418fda221733003ac2b5 +image: ghcr.io/ml6team/load_from_hf_hub:de73f003806455e248125d3b1ce19c1a52aea8ea produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 6f09d7f18..e70ce3b9c 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -39,7 +39,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 10, + "n_rows_to_load": 1000, "dataset_length": 12800000, }, node_pool_name="n2-standard-128-pool", @@ -70,6 +70,7 @@ download_images_op = ComponentOp( component_dir="components/download_images", node_pool_name="n2-standard-128-pool", + output_partition_size="disable", ) # add ops to pipeline From 0c711303bd00ed63712037bbae9dea1b89f6009c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Aug 2023 10:23:37 +0200 Subject: [PATCH 39/65] Use map_partitions --- .../download_images/fondant_component.yaml | 2 +- .../components/download_images/src/main.py | 48 +++++++++++++++---- examples/pipelines/datacomp/pipeline.py | 6 +-- 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 9844b2200..3e14e3958 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:de73f003806455e248125d3b1ce19c1a52aea8ea +image: ghcr.io/ml6team/download_images:aa4cf164c762cba00480c1335251d81a2c10fc44 consumes: image: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 0950e82e3..da554bc2c 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -11,6 +11,7 @@ import urllib import dask.dataframe as dd +import numpy as np from fondant.component import DaskTransformComponent from fondant.executor import DaskTransformExecutor @@ -88,6 +89,24 @@ def download_image_with_retry( return None, None, None +def download_image_with_retry_partition(df, timeout, retries, resizer): + # process a single partition + # TODO make column name more flexible + data = df.image_url.apply(lambda x: + download_image_with_retry( + url=x, timeout=timeout, retries=retries, resizer=resizer, + ), + ) + + # use assign to add values as extra columns + df = df.assign(data=[example[0] for example in data], + width=[example[1] for example in data], + height=[example[2] for example in data], + ) + + 
return df + + class DownloadImagesComponent(DaskTransformComponent): """Component that downloads images based on URLs.""" @@ -128,21 +147,30 @@ def __init__(self, def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: logger.info("Downloading images...") - result = dataframe.apply( - lambda example: download_image_with_retry( - url=example.image_url, timeout=self.timeout, retries=self.retries, resizer=self.resizer, - ), - axis=1, - result_type='expand', - meta={0: object, 1: int, 2: int}, + # create meta + # needs to be a dictionary with keys = column names, values = dtypes of columns + # for each column in the output + meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} + meta["data"] = np.dtype(bytes) + meta["width"] = np.dtype(int) + meta["height"] = np.dtype(int) + + dataframe = dataframe.map_partitions( + download_image_with_retry_partition, + timeout=self.timeout, + retries=self.retries, + resizer=self.resizer, + meta=meta, ) - result.columns = ['image_data', 'image_width', 'image_height'] + + # rename new columns to be conform the spec + dataframe = dataframe.rename(columns={"data": "image_data", "width": "image_width", "height":"image_height"}) print("Length of the final dataframe:", len(dataframe)) print("First few rows of final dataframe:") - print(result.head(5)) + print(dataframe.head(5)) - return result + return dataframe if __name__ == "__main__": diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index e70ce3b9c..c34e77949 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -39,7 +39,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 1000, + "n_rows_to_load": 50000, "dataset_length": 12800000, }, node_pool_name="n2-standard-128-pool", From dd9d06d649a0048bad718c235948317c241cdc9a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Aug 2023 10:56:11 +0200 Subject: [PATCH 40/65] Add logging --- .../pipelines/datacomp/components/download_images/src/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index da554bc2c..93cb50ed2 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -145,6 +145,8 @@ def __init__(self, ) def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + + logger.info("Length of the dataframe:", len(dataframe)) logger.info("Downloading images...") # create meta From a940050d378774ded4226a8e0838a175d6f65712 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Aug 2023 12:08:10 +0200 Subject: [PATCH 41/65] More improvements --- .../download_images/fondant_component.yaml | 17 +++++++++++++++-- examples/pipelines/datacomp/pipeline.py | 4 ++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git 
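Patch 39's shape, a partition-level helper plus a meta dict extended from the input dtypes, is straightforward to test standalone. A rough sketch under toy assumptions (fake fetcher, object dtype for the bytes column):

    import dask.dataframe as dd
    import numpy as np
    import pandas as pd

    def fetch_partition(part: pd.DataFrame) -> pd.DataFrame:
        results = part["image_url"].apply(lambda url: (b"\x00", 256, 256))
        return part.assign(
            data=[r[0] for r in results],
            width=[r[1] for r in results],
            height=[r[2] for r in results],
        )

    df = dd.from_pandas(pd.DataFrame({"image_url": ["http://a", "http://b"]}), npartitions=1)

    meta = {col: dtype for col, dtype in zip(df.columns, df.dtypes)}
    meta.update(data=np.dtype(object), width=np.dtype(int), height=np.dtype(int))

    out = df.map_partitions(fetch_partition, meta=meta)
    print(out.compute())
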
a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 3e14e3958..651d1230a 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:aa4cf164c762cba00480c1335251d81a2c10fc44 +image: ghcr.io/ml6team/download_images:50c28a05f04587c8fc445ab8199cbf16fb32dcac consumes: image: @@ -11,13 +11,26 @@ consumes: produces: image: fields: + url: + type: string data: type: binary width: type: int64 height: type: int64 - additionalFields: false + face_bboxes: + type: array + items: + type: array + items: + type: float32 + sha256: + type: utf8 + embedding: + type: array + items: + type: float32 args: timeout: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index c34e77949..dfb8f8342 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From 3ae578ad1bfa9278f4435d3007befc94c8292eb1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Aug 2023 12:20:32 +0200 Subject: [PATCH 42/65] More improvements --- components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 072794d95..b03bb1c7a 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:de73f003806455e248125d3b1ce19c1a52aea8ea +image: ghcr.io/ml6team/load_from_hf_hub:50c28a05f04587c8fc445ab8199cbf16fb32dcac produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index dfb8f8342..c34e77949 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From e490a2eb88c1799dcde17db7101f00ac32ddd1f3 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Aug 2023 13:50:48 +0200 Subject: [PATCH 43/65] Fix rebase --- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git 
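The nested field types this patch adds to the component spec (array of array of float32, utf8) read like Arrow types. Assuming that mapping holds, and purely as an illustration rather than Fondant's actual schema machinery, the equivalent pyarrow declarations would look like:

    import pyarrow as pa

    schema = pa.schema([
        ("image_url", pa.string()),
        ("image_data", pa.binary()),
        ("image_width", pa.int64()),
        ("image_height", pa.int64()),
        ("image_face_bboxes", pa.list_(pa.list_(pa.float32()))),
        ("image_sha256", pa.string()),  # "utf8" in the spec
        ("image_embedding", pa.list_(pa.float32())),
    ])
    print(schema)
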
a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index 5db8abca7..d5a4a0f18 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=f3f3925b8e8f634e2978e5c7fcefa72c53baba7c +ARG FONDANT_VERSION=e268128ab04bb8cfa030928d43efb0a3b77caad5 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index b03bb1c7a..537c304db 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:50c28a05f04587c8fc445ab8199cbf16fb32dcac +image: ghcr.io/ml6team/load_from_hf_hub:e268128ab04bb8cfa030928d43efb0a3b77caad5 produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index ecce89a91..a74d0ecfd 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:de73f003806455e248125d3b1ce19c1a52aea8ea +image: ghcr.io/ml6team/load_from_hf_hub:e268128ab04bb8cfa030928d43efb0a3b77caad5 produces: image: From edc65f516067937401e325650d5b5409f071bc39 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 08:49:24 +0200 Subject: [PATCH 44/65] More improvements --- components/load_from_hf_hub/Dockerfile | 2 +- .../load_from_hf_hub/fondant_component.yaml | 2 +- .../download_images/fondant_component.yaml | 16 +--------------- .../components/download_images/src/main.py | 5 ++++- .../load_from_hf_hub/fondant_component.yaml | 2 +- 5 files changed, 8 insertions(+), 19 deletions(-) diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index d5a4a0f18..30d994d98 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=e268128ab04bb8cfa030928d43efb0a3b77caad5 +ARG FONDANT_VERSION=5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 537c304db..47fb2de4d 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:e268128ab04bb8cfa030928d43efb0a3b77caad5 +image: ghcr.io/ml6team/load_from_hf_hub:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 produces: dummy_variable: #TODO: fill in here diff --git 
a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 651d1230a..c83cf87d7 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:50c28a05f04587c8fc445ab8199cbf16fb32dcac +image: ghcr.io/ml6team/download_images:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 consumes: image: @@ -11,26 +11,12 @@ consumes: produces: image: fields: - url: - type: string data: type: binary width: type: int64 height: type: int64 - face_bboxes: - type: array - items: - type: array - items: - type: float32 - sha256: - type: utf8 - embedding: - type: array - items: - type: float32 args: timeout: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 93cb50ed2..377c9ccef 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -166,7 +166,10 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: ) # rename new columns to be conform the spec - dataframe = dataframe.rename(columns={"data": "image_data", "width": "image_width", "height":"image_height"}) + dataframe = dataframe.rename(columns={"data": "image_data", "width": "image_width", "height":"image_height"}) + + # Remove images that could not be fetched + dataframe = dataframe.dropna() print("Length of the final dataframe:", len(dataframe)) print("First few rows of final dataframe:") diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index a74d0ecfd..7a65e0bc0 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:e268128ab04bb8cfa030928d43efb0a3b77caad5 +image: ghcr.io/ml6team/load_from_hf_hub:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 produces: image: From 7acbeba1796d23b8e055c62a7ebc98c2d339163b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 08:52:57 +0200 Subject: [PATCH 45/65] Fix rebase --- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index 30d994d98..7df92b7f6 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 +ARG FONDANT_VERSION=edc65f516067937401e325650d5b5409f071bc39 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml 
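One caveat on the dropna() added in patch 44 above: with no arguments it drops a row if any column is missing, not only the fetched-image ones. A small sketch of restricting the drop with subset (illustrative column names, not a change the patch series itself makes):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "image_url": ["http://a", "http://b"],
        "image_data": [b"\x00", None],      # second download failed
        "image_width": [256.0, np.nan],
        "image_height": [256.0, np.nan],
    })

    df = df.dropna(subset=["image_data", "image_width", "image_height"])
    print(len(df))  # 1
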
b/components/load_from_hf_hub/fondant_component.yaml index 47fb2de4d..f4baa6378 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 +image: ghcr.io/ml6team/load_from_hf_hub:edc65f516067937401e325650d5b5409f071bc39 produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 7a65e0bc0..44d96a8ab 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -31,6 +31,8 @@ produces: image_text: fields: + uid: + type: string clip_b32_similarity_score: type: float32 clip_l14_similarity_score: From f225ef9e6b24c38795d841a98c9e0957c120eff8 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 09:55:40 +0200 Subject: [PATCH 46/65] Include uids --- .../components/download_images/fondant_component.yaml | 2 +- .../components/load_from_hf_hub/fondant_component.yaml | 6 +++--- examples/pipelines/datacomp/pipeline.py | 4 ++++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index c83cf87d7..5e846f2ae 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 +image: ghcr.io/ml6team/download_images:edc65f516067937401e325650d5b5409f071bc39 consumes: image: diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 44d96a8ab..dd48dd285 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,10 +1,12 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 +image: ghcr.io/ml6team/load_from_hf_hub:edc65f516067937401e325650d5b5409f071bc39 produces: image: fields: + uid: + type: string url: type: string width: @@ -31,8 +33,6 @@ produces: image_text: fields: - uid: - type: string clip_b32_similarity_score: type: float32 clip_l14_similarity_score: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index c34e77949..7f7a27164 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -22,6 +22,7 @@ # define ops load_component_column_mapping = { + "uid": "image_uid", "url": "image_url", "original_width": "image_width", "original_height": "image_height", @@ -42,12 +43,14 @@ "n_rows_to_load": 50000, "dataset_length": 12800000, }, + node_pool_label="node_pool", node_pool_name="n2-standard-128-pool", # output_partition_size="10MB", ) filter_image_resolution_op = ComponentOp.from_registry( 
name="filter_image_resolution", arguments={"min_image_dim": 200, "max_aspect_ratio": 3}, + node_pool_label="node_pool", node_pool_name="n2-standard-128-pool", output_partition_size='disable', ) @@ -69,6 +72,7 @@ ) download_images_op = ComponentOp( component_dir="components/download_images", + node_pool_label="node_pool", node_pool_name="n2-standard-128-pool", output_partition_size="disable", ) From 1bfe2593b5058b663b608f21740425b482c30b5c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 10:38:10 +0200 Subject: [PATCH 47/65] More improvements --- .../components/detect_text/Dockerfile | 23 +++ .../detect_text/fondant_component.yaml | 17 +++ .../components/detect_text/requirements.txt | 6 + .../components/detect_text/src/main.py | 141 ++++++++++++++++++ .../load_from_hf_hub/fondant_component.yaml | 4 +- examples/pipelines/datacomp/pipeline.py | 8 +- 6 files changed, 193 insertions(+), 6 deletions(-) create mode 100644 examples/pipelines/datacomp/components/detect_text/Dockerfile create mode 100644 examples/pipelines/datacomp/components/detect_text/fondant_component.yaml create mode 100644 examples/pipelines/datacomp/components/detect_text/requirements.txt create mode 100644 examples/pipelines/datacomp/components/detect_text/src/main.py diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile new file mode 100644 index 000000000..787f89bb4 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile @@ -0,0 +1,23 @@ +FROM --platform=linux/amd64 python:3.8-slim + +# System dependencies +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install git -y + +# Install requirements +COPY requirements.txt / +RUN pip3 install --no-cache-dir -r requirements.txt + +# Install Fondant +# This is split from other requirements to leverage caching +ARG FONDANT_VERSION=7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 +RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} + +# Set the working directory to the component folder +WORKDIR /component/src + +# Copy over src-files +COPY src/ . 
+ +ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml new file mode 100644 index 000000000..2f1026e53 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -0,0 +1,17 @@ +name: Detect text +description: Component that detects text in images +image: ghcr.io/ml6team/detext_text:edc65f516067937401e325650d5b5409f071bc39 + +consumes: + image: + fields: + data: + type: bytes + +produces: + image: + fields: + data: + type: binary + detected_boxes: + type: int64 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text/requirements.txt b/examples/pipelines/datacomp/components/detect_text/requirements.txt new file mode 100644 index 000000000..10913ea37 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text/requirements.txt @@ -0,0 +1,6 @@ +huggingface-hub==0.16.4 +easyocr==1.7.0 +onnxruntime==1.15.1 +onnxruntime-gpu==1.15.1 +Pillow==10.0.0 +torch==2.0.1 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py new file mode 100644 index 000000000..c10a193e0 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -0,0 +1,141 @@ +"""This component detexts text in images, using CRAFT. +""" +import logging + +import dask.dataframe as dd +import numpy as np +import io +from PIL import Image +import pandas as pd + +from huggingface_hub import hf_hub_download + +from easyocr.craft_utils import getDetBoxes, adjustResultCoordinates +from easyocr.imgproc import normalizeMeanVariance +from easyocr.utils import group_text_box + +import torch +import onnxruntime as ort + +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor + +logger = logging.getLogger(__name__) + + +def resize_aspect_ratio_pillow(img, square_size, mag_ratio=1): + height, width, channel = img.shape + + # magnify image size + target_size = mag_ratio * max(height, width) + + # set original image size + if target_size > square_size: + target_size = square_size + + ratio = target_size / max(height, width) + + target_h, target_w = int(height * ratio), int(width * ratio) + img = Image.fromarray(img) + proc = img.resize((target_w, target_h), resample = Image.BILINEAR) + + # make canvas and paste image + target_h32, target_w32 = target_h, target_w + if target_h % 32 != 0: + target_h32 = target_h + (32 - target_h % 32) + if target_w % 32 != 0: + target_w32 = target_w + (32 - target_w % 32) + resized = np.zeros((target_h32, target_w32, channel), dtype=np.float32) + resized[0:target_h, 0:target_w, :] = proc + target_h, target_w = target_h32, target_w32 + + size_heatmap = (int(target_w/2), int(target_h/2)) + + return resized, ratio, size_heatmap + + +def get_boxes(image_data, session): + try: + image = Image.open(io.BytesIO(image_data)).convert("RGB") + image = np.array(image) + except: + return [None] + + # Use Pillow instead of cv2 + img_resized, target_ratio, size_heatmap = resize_aspect_ratio_pillow(img=image, + square_size=512, + mag_ratio=1.0) + + ratio_h = ratio_w = 1 / target_ratio + x = normalizeMeanVariance(img_resized) + x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0) + + input_name = session.get_inputs()[0].name + + # Prepare input tensor for inference + inp = {input_name: x.numpy()} + + 
# Run inference and get output + y, _ = session.run(None, inp) + + # Extract score and link maps + score_text = y[0, :, :, 0] + score_link = y[0, :, :, 1] + + # Post-processing to obtain bounding boxes and polygons + boxes, _, _ = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4) + boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h) + + # Create horizontal reading list + polys = [] + for box in boxes: + poly = np.array(box).astype(np.int32).reshape((-1)) + polys.append(poly) + + horizontal_list, _ = group_text_box(polys) + + return horizontal_list + + +def get_boxes_dataframe(df, session): + # process a single partition + # TODO make column name more flexible + df["image_detected_boxes"] = df.image_data.apply(lambda x: + get_boxes( + image_data=x, session=session, + ), + ) + + return df + + +class DetextTextComponent(DaskTransformComponent): + """Component that detexts text in images, using the CRAFT model. + """ + + def __init__(self, *args) -> None: + + craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") + providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] + self.session = ort.InferenceSession(craft_onnx, providers=providers) + + def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + + # create meta + # needs to be a dictionary with keys = column names, values = dtypes of columns + # for each column in the output + meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} + meta["image_detected_boxes"] = np.dtype(object) + + dataframe = dataframe.map_partitions( + get_boxes_dataframe, + session=self.session, + meta=meta, + ) + + return dataframe + + +if __name__ == "__main__": + executor = DaskTransformExecutor.from_args() + executor.execute(DetextTextComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index dd48dd285..01e5994ff 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -5,8 +5,6 @@ image: ghcr.io/ml6team/load_from_hf_hub:edc65f516067937401e325650d5b5409f071bc39 produces: image: fields: - uid: - type: string url: type: string width: @@ -33,6 +31,8 @@ produces: image_text: fields: + uid: + type: string clip_b32_similarity_score: type: float32 clip_l14_similarity_score: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 7f7a27164..db4fcccbe 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,14 +15,13 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) # define ops load_component_column_mapping = { - "uid": "image_uid", "url": "image_url", "original_width": "image_width", "original_height": "image_height", @@ -30,6 +29,7 @@ "sha256": "image_sha256", "clip_l14_embedding": "image_embedding", "text": "text_data", + "uid": "image_text_uid", 
"clip_b32_similarity_score": "image_text_clip_b32_similarity_score", "clip_l14_similarity_score": "image_text_clip_l14_similarity_score", } @@ -40,7 +40,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 50000, + "n_rows_to_load": 10, "dataset_length": 12800000, }, node_pool_label="node_pool", From 025399a9663800c25214ee718ef4b4661a5e00df Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 11:31:41 +0200 Subject: [PATCH 48/65] More improvements --- .../datacomp/components/detect_text/Dockerfile | 2 +- .../components/detect_text/fondant_component.yaml | 10 +++++++--- .../datacomp/components/detect_text/src/main.py | 3 ++- examples/pipelines/datacomp/pipeline.py | 12 ++++++++++-- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile index 787f89bb4..1430c6972 100644 --- a/examples/pipelines/datacomp/components/detect_text/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 +ARG FONDANT_VERSION=1bfe2593b5058b663b608f21740425b482c30b5c RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 2f1026e53..49d6688c5 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,12 +1,12 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detext_text:edc65f516067937401e325650d5b5409f071bc39 +image: ghcr.io/ml6team/detect_text:1bfe2593b5058b663b608f21740425b482c30b5c consumes: image: fields: data: - type: bytes + type: binary produces: image: @@ -14,4 +14,8 @@ produces: data: type: binary detected_boxes: - type: int64 \ No newline at end of file + type: array + items: + type: array + items: + type: int64 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index c10a193e0..ca4f23fc1 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -6,7 +6,6 @@ import numpy as np import io from PIL import Image -import pandas as pd from huggingface_hub import hf_hub_download @@ -116,6 +115,7 @@ class DetextTextComponent(DaskTransformComponent): def __init__(self, *args) -> None: craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") + logger.info("Device:" ort.get_device()) providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] self.session = ort.InferenceSession(craft_onnx, providers=providers) @@ -127,6 +127,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} 
meta["image_detected_boxes"] = np.dtype(object) + logger.info("Detecting texts..") dataframe = dataframe.map_partitions( get_boxes_dataframe, session=self.session, diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index db4fcccbe..9beafe25a 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -76,12 +76,20 @@ node_pool_name="n2-standard-128-pool", output_partition_size="disable", ) +detect_text_op = ComponentOp( + component_dir="components/detect_text", + node_pool_label="node_pool", + node_pool_name="n2-standard-128-pool", + output_partition_size="disable", +) + # add ops to pipeline pipeline.add_op(load_from_hub_op) # pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) pipeline.add_op(download_images_op, dependencies=load_from_hub_op) +pipeline.add_op(detect_text_op, dependencies=download_images_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From 0c9ac7f63812be43c3cffc30396ef4a71cd0578c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 11:58:09 +0200 Subject: [PATCH 49/65] More improvements --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- .../pipelines/datacomp/components/detect_text/src/main.py | 2 +- examples/pipelines/datacomp/pipeline.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 49d6688c5..9f5931812 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:1bfe2593b5058b663b608f21740425b482c30b5c +image: ghcr.io/ml6team/detect_text:025399a9663800c25214ee718ef4b4661a5e00df consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index ca4f23fc1..1acc0d162 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -115,7 +115,7 @@ class DetextTextComponent(DaskTransformComponent): def __init__(self, *args) -> None: craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") - logger.info("Device:" ort.get_device()) + logger.info("Device:", ort.get_device()) providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] self.session = ort.InferenceSession(craft_onnx, providers=providers) diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 9beafe25a..0384cde31 100644 --- 
a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -78,8 +78,9 @@ ) detect_text_op = ComponentOp( component_dir="components/detect_text", - node_pool_label="node_pool", - node_pool_name="n2-standard-128-pool", + number_of_gpus=1, + node_pool_label="node_pool", + node_pool_name="model-inference-pool", output_partition_size="disable", ) From c179edf9bc4b95c0624750a0f51d5c82d5ee68a2 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 13:30:11 +0200 Subject: [PATCH 50/65] More improvements --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- .../pipelines/datacomp/components/detect_text/src/main.py | 2 ++ .../datacomp/components/download_images/src/main.py | 6 ++---- examples/pipelines/datacomp/pipeline.py | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 9f5931812..befdf1dc3 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:025399a9663800c25214ee718ef4b4661a5e00df +image: ghcr.io/ml6team/detect_text:0c9ac7f63812be43c3cffc30396ef4a71cd0578c consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index 1acc0d162..15a2dca87 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -121,6 +121,8 @@ def __init__(self, *args) -> None: def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + logger.info(f"Length of the dataframe: {len(dataframe)}") + # create meta # needs to be a dictionary with keys = column names, values = dtypes of columns # for each column in the output diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 377c9ccef..10932daa5 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -146,7 +146,7 @@ def __init__(self, def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: - logger.info("Length of the dataframe:", len(dataframe)) + logger.info(f"Length of the dataframe: {len(dataframe)}") logger.info("Downloading images...") # create meta @@ -171,9 +171,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: # Remove images that could not be fetched dataframe = dataframe.dropna() - print("Length of the final dataframe:", len(dataframe)) - print("First few rows of final dataframe:") - print(dataframe.head(5)) + logger.info(f"Length of the final dataframe: {len(dataframe)}") return dataframe diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 0384cde31..1eb5c8a3d 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # 
base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -90,7 +90,7 @@ # pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) pipeline.add_op(download_images_op, dependencies=load_from_hub_op) -pipeline.add_op(detect_text_op, dependencies=download_images_op) +# pipeline.add_op(detect_text_op, dependencies=download_images_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From 78660256b14287b586b374afcd129ea23f538cc6 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 14:45:02 +0200 Subject: [PATCH 51/65] More improvements --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- .../datacomp/components/detect_text/requirements.txt | 1 - .../pipelines/datacomp/components/detect_text/src/main.py | 5 ++++- .../components/download_images/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index befdf1dc3..f647bf08f 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:0c9ac7f63812be43c3cffc30396ef4a71cd0578c +image: ghcr.io/ml6team/detect_text:c179edf9bc4b95c0624750a0f51d5c82d5ee68a2 consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text/requirements.txt b/examples/pipelines/datacomp/components/detect_text/requirements.txt index 10913ea37..16a68c71b 100644 --- a/examples/pipelines/datacomp/components/detect_text/requirements.txt +++ b/examples/pipelines/datacomp/components/detect_text/requirements.txt @@ -1,6 +1,5 @@ huggingface-hub==0.16.4 easyocr==1.7.0 -onnxruntime==1.15.1 onnxruntime-gpu==1.15.1 Pillow==10.0.0 torch==2.0.1 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index 15a2dca87..72c6d771c 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -115,7 +115,7 @@ class DetextTextComponent(DaskTransformComponent): def __init__(self, *args) -> None: craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") - logger.info("Device:", ort.get_device()) + logger.info(f"Device: {ort.get_device()}") providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] self.session = ort.InferenceSession(craft_onnx, providers=providers) @@ -136,6 +136,9 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: meta=meta, ) + logger.info(f"Length of the final dataframe: {len(dataframe)}") + print("First rows of final dataframe:", dataframe.head()) + return dataframe diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 5e846f2ae..b756099c8 100644 --- 
a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:edc65f516067937401e325650d5b5409f071bc39 +image: ghcr.io/ml6team/download_images:c179edf9bc4b95c0624750a0f51d5c82d5ee68a2 consumes: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 1eb5c8a3d..666656403 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From 22de5f60ae89d6f6efc56a6d1002e147cab75fd2 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 15:13:31 +0200 Subject: [PATCH 52/65] Use cpu for now --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- .../pipelines/datacomp/components/detect_text/src/main.py | 1 + examples/pipelines/datacomp/pipeline.py | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index f647bf08f..db667c29f 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:c179edf9bc4b95c0624750a0f51d5c82d5ee68a2 +image: ghcr.io/ml6team/detect_text:78660256b14287b586b374afcd129ea23f538cc6 consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index 72c6d771c..f4a27c562 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -117,6 +117,7 @@ def __init__(self, *args) -> None: craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") logger.info(f"Device: {ort.get_device()}") providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] + providers = ['CPUExecutionProvider'] self.session = ort.InferenceSession(craft_onnx, providers=providers) def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 666656403..138ff986f 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = 
Client(host=PipelineConfigs.HOST) @@ -90,7 +90,7 @@ # pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) pipeline.add_op(download_images_op, dependencies=load_from_hub_op) -# pipeline.add_op(detect_text_op, dependencies=download_images_op) +pipeline.add_op(detect_text_op, dependencies=download_images_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From dcc714e12374928e893b2fd6865b151923f0a791 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 16:39:03 +0200 Subject: [PATCH 53/65] More improvements --- .../datacomp/components/detect_text/fondant_component.yaml | 4 ++-- .../pipelines/datacomp/components/detect_text/src/main.py | 5 +++-- .../components/download_images/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 6 +++--- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index db667c29f..46e9f8252 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:78660256b14287b586b374afcd129ea23f538cc6 +image: ghcr.io/ml6team/detect_text:22de5f60ae89d6f6efc56a6d1002e147cab75fd2 consumes: image: @@ -13,7 +13,7 @@ produces: fields: data: type: binary - detected_boxes: + boxes: type: array items: type: array diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index f4a27c562..8d0d8fe0b 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -99,7 +99,7 @@ def get_boxes(image_data, session): def get_boxes_dataframe(df, session): # process a single partition # TODO make column name more flexible - df["image_detected_boxes"] = df.image_data.apply(lambda x: + df["image_boxes"] = df.image_data.apply(lambda x: get_boxes( image_data=x, session=session, ), @@ -128,7 +128,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: # needs to be a dictionary with keys = column names, values = dtypes of columns # for each column in the output meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} - meta["image_detected_boxes"] = np.dtype(object) + meta["image_boxes"] = np.dtype(object) logger.info("Detecting texts..") dataframe = dataframe.map_partitions( @@ -138,6 +138,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: ) logger.info(f"Length of the final dataframe: {len(dataframe)}") + print("Columns of the final dataframe", dataframe.columns) print("First rows of final dataframe:", dataframe.head()) return dataframe diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index b756099c8..620984e9e 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: 
ghcr.io/ml6team/download_images:c179edf9bc4b95c0624750a0f51d5c82d5ee68a2 +image: ghcr.io/ml6team/download_images:22de5f60ae89d6f6efc56a6d1002e147cab75fd2 consumes: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 138ff986f..dfe6f9f4f 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -40,7 +40,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 10, + "n_rows_to_load": 1000, "dataset_length": 12800000, }, node_pool_label="node_pool", From fa341a58a463c825f90198a42adbe40ce1cc600a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 16:58:59 +0200 Subject: [PATCH 54/65] Run text detection on 1000 images --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 46e9f8252..54aaffd6b 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:22de5f60ae89d6f6efc56a6d1002e147cab75fd2 +image: ghcr.io/ml6team/detect_text:dcc714e12374928e893b2fd6865b151923f0a791 consumes: image: From dada0106b4a39ad53025704541231d350ee262a2 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 17:49:26 +0200 Subject: [PATCH 55/65] Remove print statement --- examples/pipelines/datacomp/components/detect_text/Dockerfile | 4 ++-- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- .../pipelines/datacomp/components/detect_text/src/main.py | 1 - src/fondant/data_io.py | 3 --- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile index 1430c6972..d6059a33a 100644 --- a/examples/pipelines/datacomp/components/detect_text/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 python:3.8-slim +FROM --platform=linux/amd64 pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime # System dependencies RUN apt-get update && \ @@ -6,7 +6,7 @@ RUN apt-get update && \ apt-get install git -y # Install requirements -COPY requirements.txt / +COPY requirements.txt ./ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 54aaffd6b..520bda66a 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: 
Component that detects text in images -image: ghcr.io/ml6team/detect_text:dcc714e12374928e893b2fd6865b151923f0a791 +image: ghcr.io/ml6team/detect_text:fa341a58a463c825f90198a42adbe40ce1cc600a consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index 8d0d8fe0b..e4e0a4d48 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -117,7 +117,6 @@ def __init__(self, *args) -> None: craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") logger.info(f"Device: {ort.get_device()}") providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] - providers = ['CPUExecutionProvider'] self.session = ort.InferenceSession(craft_onnx, providers=providers) def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 1f109a96c..4fa442747 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -93,9 +93,6 @@ def _load_subset(self, subset_name: str, fields: t.List[str]) -> dd.DataFrame: subset_df = dd.read_parquet(remote_path, columns=fields) - logger.info(f"First few rows of subset {subset_name}:") - print(subset_df.head()) - # add subset prefix to columns subset_df = subset_df.rename( columns={col: subset_name + "_" + col for col in subset_df.columns}, From 7ec067b0d610d9658ba2dffe0ac33475926840bb Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 20:02:41 +0200 Subject: [PATCH 56/65] More improvements --- examples/pipelines/datacomp/components/detect_text/Dockerfile | 2 +- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile index d6059a33a..e37002456 100644 --- a/examples/pipelines/datacomp/components/detect_text/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=1bfe2593b5058b663b608f21740425b482c30b5c +ARG FONDANT_VERSION=dada0106b4a39ad53025704541231d350ee262a2 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 520bda66a..1caae76a2 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:fa341a58a463c825f90198a42adbe40ce1cc600a +image: ghcr.io/ml6team/detect_text:dada0106b4a39ad53025704541231d350ee262a2 consumes: image: From ac2a130dd070d538d6f9a16855494261a012dded Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 4 Aug 2023 10:24:55 +0200 Subject: [PATCH 57/65] More improvements --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- 
 .../pipelines/datacomp/components/detect_text/src/main.py  | 6 ------
 .../datacomp/components/download_images/Dockerfile         | 2 +-
 .../components/download_images/fondant_component.yaml      | 2 +-
 scripts/build_components.sh                                | 1 +
 5 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
index 1caae76a2..aafc45523 100644
--- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Detect text
 description: Component that detects text in images
-image: ghcr.io/ml6team/detect_text:dada0106b4a39ad53025704541231d350ee262a2
+image: ghcr.io/ml6team/detect_text:7ec067b0d610d9658ba2dffe0ac33475926840bb

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py
index e4e0a4d48..ecb7e7655 100644
--- a/examples/pipelines/datacomp/components/detect_text/src/main.py
+++ b/examples/pipelines/datacomp/components/detect_text/src/main.py
@@ -121,8 +121,6 @@ def __init__(self, *args) -> None:

     def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:

-        logger.info(f"Length of the dataframe: {len(dataframe)}")
-
         # create meta
         # needs to be a dictionary with keys = column names, values = dtypes of columns
         # for each column in the output
@@ -136,10 +134,6 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:
             meta=meta,
         )

-        logger.info(f"Length of the final dataframe: {len(dataframe)}")
-        print("Columns of the final dataframe", dataframe.columns)
-        print("First rows of final dataframe:", dataframe.head())
-
         return dataframe

diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile
index 787f89bb4..c5c2a7767 100644
--- a/examples/pipelines/datacomp/components/download_images/Dockerfile
+++ b/examples/pipelines/datacomp/components/download_images/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=7db2865958ff18a3aec6aafbb3374c2a70a6b8f5
+ARG FONDANT_VERSION=7ec067b0d610d9658ba2dffe0ac33475926840bb
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
index 620984e9e..e988ebe46 100644
--- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Download images
 description: Component that downloads images based on URLs
-image: ghcr.io/ml6team/download_images:22de5f60ae89d6f6efc56a6d1002e147cab75fd2
+image: ghcr.io/ml6team/download_images:7ec067b0d610d9658ba2dffe0ac33475926840bb

 consumes:
   image:
diff --git a/scripts/build_components.sh b/scripts/build_components.sh
index 265d08b83..178acba99 100755
--- a/scripts/build_components.sh
+++ b/scripts/build_components.sh
@@ -97,6 +97,7 @@ for dir in "${components_to_build[@]}"; do
     docker build --push "${args[@]}" \
       --build-arg="FONDANT_VERSION=${tags[0]}" \
      --label org.opencontainers.image.source=https://github.com/${namespace}/${repo} \
+      --platform=linux/amd64 \
       .

   popd

From 3d433d9e8dfeba967236445657dd8f415726de9a Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Fri, 4 Aug 2023 10:40:55 +0200
Subject: [PATCH 58/65] More improvements

---
 examples/pipelines/datacomp/components/detect_text/Dockerfile  | 2 +-
 .../datacomp/components/detect_text/fondant_component.yaml     | 2 +-
 examples/pipelines/datacomp/components/detect_text/src/main.py | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile
index e37002456..b52d9a008 100644
--- a/examples/pipelines/datacomp/components/detect_text/Dockerfile
+++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile
@@ -1,4 +1,4 @@
-FROM --platform=linux/amd64 pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
+FROM --platform=linux/amd64 python:3.8-slim

 # System dependencies
 RUN apt-get update && \
diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
index aafc45523..bf82cf094 100644
--- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Detect text
 description: Component that detects text in images
-image: ghcr.io/ml6team/detect_text:7ec067b0d610d9658ba2dffe0ac33475926840bb
+image: ghcr.io/ml6team/detect_text:ac2a130dd070d538d6f9a16855494261a012dded

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py
index ecb7e7655..bfcf79529 100644
--- a/examples/pipelines/datacomp/components/detect_text/src/main.py
+++ b/examples/pipelines/datacomp/components/detect_text/src/main.py
@@ -117,6 +117,7 @@ def __init__(self, *args) -> None:
         craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model")
         logger.info(f"Device: {ort.get_device()}")
         providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider']
+        providers = ['CPUExecutionProvider']
         self.session = ort.InferenceSession(craft_onnx, providers=providers)

     def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:

From cc418db0964ee202ebf2d4fd3aaee15604604244 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Fri, 4 Aug 2023 14:59:11 +0200
Subject: [PATCH 59/65] Simplify requirements

---
 .../detect_text/fondant_component.yaml             |   2 +-
 .../components/detect_text/requirements.txt        |   8 +-
 .../detect_text/src/easyocr_utils.py               | 366 ++++++++++++++++++
 .../components/detect_text/src/main.py             |   6 +-
 .../components/download_images/Dockerfile          |   2 +-
 .../download_images/fondant_component.yaml         |   2 +-
 examples/pipelines/datacomp/pipeline.py            |   6 +-
 7 files changed, 378 insertions(+), 14 deletions(-)
 create mode 100644 examples/pipelines/datacomp/components/detect_text/src/easyocr_utils.py

diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
index bf82cf094..7a6579aa0 100644
--- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Detect text
 description: Component that detects text in images
-image: ghcr.io/ml6team/detect_text:ac2a130dd070d538d6f9a16855494261a012dded
+image: ghcr.io/ml6team/detect_text:3d433d9e8dfeba967236445657dd8f415726de9a

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/detect_text/requirements.txt b/examples/pipelines/datacomp/components/detect_text/requirements.txt
index 16a68c71b..62fc9dd06 100644
--- a/examples/pipelines/datacomp/components/detect_text/requirements.txt
+++ b/examples/pipelines/datacomp/components/detect_text/requirements.txt
@@ -1,5 +1,5 @@
 huggingface-hub==0.16.4
-easyocr==1.7.0
-onnxruntime-gpu==1.15.1
-Pillow==10.0.0
-torch==2.0.1
\ No newline at end of file
+onnxruntime==1.15.1
+torch==2.0.1
+opencv-python-headless
+scipy
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text/src/easyocr_utils.py b/examples/pipelines/datacomp/components/detect_text/src/easyocr_utils.py
new file mode 100644
index 000000000..4feb67768
--- /dev/null
+++ b/examples/pipelines/datacomp/components/detect_text/src/easyocr_utils.py
@@ -0,0 +1,366 @@
+"""
+Copyright (c) 2019-present NAVER Corp.
+MIT License
+"""
+import numpy as np
+import cv2
+import math
+from scipy.ndimage import label
+
+""" auxiliary functions """
+# unwarp coordinates
+def warpCoord(Minv, pt):
+    out = np.matmul(Minv, (pt[0], pt[1], 1))
+    return np.array([out[0]/out[2], out[1]/out[2]])
+""" end of auxiliary functions """
+
+
+def getDetBoxes_core(textmap, linkmap, text_threshold, link_threshold, low_text, estimate_num_chars=False):
+    # prepare data
+    linkmap = linkmap.copy()
+    textmap = textmap.copy()
+    img_h, img_w = textmap.shape
+
+    """ labeling method """
+    ret, text_score = cv2.threshold(textmap, low_text, 1, 0)
+    ret, link_score = cv2.threshold(linkmap, link_threshold, 1, 0)
+
+    text_score_comb = np.clip(text_score + link_score, 0, 1)
+    nLabels, labels, stats, centroids = cv2.connectedComponentsWithStats(text_score_comb.astype(np.uint8), connectivity=4)
+
+    det = []
+    mapper = []
+    for k in range(1,nLabels):
+        # size filtering
+        size = stats[k, cv2.CC_STAT_AREA]
+        if size < 10: continue
+
+        # thresholding
+        if np.max(textmap[labels==k]) < text_threshold: continue
+
+        # make segmentation map
+        segmap = np.zeros(textmap.shape, dtype=np.uint8)
+        segmap[labels==k] = 255
+        if estimate_num_chars:
+            _, character_locs = cv2.threshold((textmap - linkmap) * segmap /255., text_threshold, 1, 0)
+            _, n_chars = label(character_locs)
+            mapper.append(n_chars)
+        else:
+            mapper.append(k)
+        segmap[np.logical_and(link_score==1, text_score==0)] = 0   # remove link area
+        x, y = stats[k, cv2.CC_STAT_LEFT], stats[k, cv2.CC_STAT_TOP]
+        w, h = stats[k, cv2.CC_STAT_WIDTH], stats[k, cv2.CC_STAT_HEIGHT]
+        niter = int(math.sqrt(size * min(w, h) / (w * h)) * 2)
+        sx, ex, sy, ey = x - niter, x + w + niter + 1, y - niter, y + h + niter + 1
+        # boundary check
+        if sx < 0 : sx = 0
+        if sy < 0 : sy = 0
+        if ex >= img_w: ex = img_w
+        if ey >= img_h: ey = img_h
+        kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(1 + niter, 1 + niter))
+        segmap[sy:ey, sx:ex] = cv2.dilate(segmap[sy:ey, sx:ex], kernel)
+
+        # make box
+        np_contours = np.roll(np.array(np.where(segmap!=0)),1,axis=0).transpose().reshape(-1,2)
+        rectangle = cv2.minAreaRect(np_contours)
+        box = cv2.boxPoints(rectangle)
+
+        # align diamond-shape
+        w, h = np.linalg.norm(box[0] - box[1]), np.linalg.norm(box[1] - box[2])
+        box_ratio = max(w, h) / (min(w, h) + 1e-5)
+        if abs(1 - box_ratio) <= 0.1:
+            l, r = min(np_contours[:,0]), max(np_contours[:,0])
+            t, b = min(np_contours[:,1]), max(np_contours[:,1])
+            box = np.array([[l, t], [r, t], [r, b], [l, b]], dtype=np.float32)
+
+        # make clock-wise order
+        startidx = box.sum(axis=1).argmin()
+        box = np.roll(box, 4-startidx, 0)
+        box = np.array(box)
+
+        det.append(box)
+
+    return det, labels, mapper
+
+def getPoly_core(boxes, labels, mapper, linkmap):
+    # configs
+    num_cp = 5
+    max_len_ratio = 0.7
+    expand_ratio = 1.45
+    max_r = 2.0
+    step_r = 0.2
+
+    polys = []
+    for k, box in enumerate(boxes):
+        # size filter for small instance
+        w, h = int(np.linalg.norm(box[0] - box[1]) + 1), int(np.linalg.norm(box[1] - box[2]) + 1)
+        if w < 10 or h < 10:
+            polys.append(None); continue
+
+        # warp image
+        tar = np.float32([[0,0],[w,0],[w,h],[0,h]])
+        M = cv2.getPerspectiveTransform(box, tar)
+        word_label = cv2.warpPerspective(labels, M, (w, h), flags=cv2.INTER_NEAREST)
+        try:
+            Minv = np.linalg.inv(M)
+        except:
+            polys.append(None); continue
+
+        # binarization for selected label
+        cur_label = mapper[k]
+        word_label[word_label != cur_label] = 0
+        word_label[word_label > 0] = 1
+
+        """ Polygon generation """
+        # find top/bottom contours
+        cp = []
+        max_len = -1
+        for i in range(w):
+            region = np.where(word_label[:,i] != 0)[0]
+            if len(region) < 2 : continue
+            cp.append((i, region[0], region[-1]))
+            length = region[-1] - region[0] + 1
+            if length > max_len: max_len = length
+
+        # pass if max_len is similar to h
+        if h * max_len_ratio < max_len:
+            polys.append(None); continue
+
+        # get pivot points with fixed length
+        tot_seg = num_cp * 2 + 1
+        seg_w = w / tot_seg     # segment width
+        pp = [None] * num_cp    # init pivot points
+        cp_section = [[0, 0]] * tot_seg
+        seg_height = [0] * num_cp
+        seg_num = 0
+        num_sec = 0
+        prev_h = -1
+        for i in range(0,len(cp)):
+            (x, sy, ey) = cp[i]
+            if (seg_num + 1) * seg_w <= x and seg_num <= tot_seg:
+                # average previous segment
+                if num_sec == 0: break
+                cp_section[seg_num] = [cp_section[seg_num][0] / num_sec, cp_section[seg_num][1] / num_sec]
+                num_sec = 0
+
+                # reset variables
+                seg_num += 1
+                prev_h = -1
+
+            # accumulate center points
+            cy = (sy + ey) * 0.5
+            cur_h = ey - sy + 1
+            cp_section[seg_num] = [cp_section[seg_num][0] + x, cp_section[seg_num][1] + cy]
+            num_sec += 1
+
+            if seg_num % 2 == 0: continue # No polygon area
+
+            if prev_h < cur_h:
+                pp[int((seg_num - 1)/2)] = (x, cy)
+                seg_height[int((seg_num - 1)/2)] = cur_h
+                prev_h = cur_h
+
+        # processing last segment
+        if num_sec != 0:
+            cp_section[-1] = [cp_section[-1][0] / num_sec, cp_section[-1][1] / num_sec]
+
+        # pass if num of pivots is not sufficient or segment width is smaller than character height
+        if None in pp or seg_w < np.max(seg_height) * 0.25:
+            polys.append(None); continue
+
+        # calc median maximum of pivot points
+        half_char_h = np.median(seg_height) * expand_ratio / 2
+
+        # calc gradient and apply to make horizontal pivots
+        new_pp = []
+        for i, (x, cy) in enumerate(pp):
+            dx = cp_section[i * 2 + 2][0] - cp_section[i * 2][0]
+            dy = cp_section[i * 2 + 2][1] - cp_section[i * 2][1]
+            if dx == 0:     # gradient if zero
+                new_pp.append([x, cy - half_char_h, x, cy + half_char_h])
+                continue
+            rad = - math.atan2(dy, dx)
+            c, s = half_char_h * math.cos(rad), half_char_h * math.sin(rad)
+            new_pp.append([x - s, cy - c, x + s, cy + c])
+
+        # get edge points to cover character heatmaps
+        isSppFound, isEppFound = False, False
+        grad_s = (pp[1][1] - pp[0][1]) / (pp[1][0] - pp[0][0]) + (pp[2][1] - pp[1][1]) / (pp[2][0] - pp[1][0])
+        grad_e = (pp[-2][1] - pp[-1][1]) / (pp[-2][0] - pp[-1][0]) + (pp[-3][1] - pp[-2][1]) / (pp[-3][0] - pp[-2][0])
+        for r in np.arange(0.5, max_r, step_r):
+            dx = 2 * half_char_h * r
+            if not isSppFound:
+                line_img = np.zeros(word_label.shape, dtype=np.uint8)
+                dy = grad_s * dx
+                p = np.array(new_pp[0]) - np.array([dx, dy, dx, dy])
+                cv2.line(line_img, (int(p[0]), int(p[1])), (int(p[2]), int(p[3])), 1, thickness=1)
+                if np.sum(np.logical_and(word_label, line_img)) == 0 or r + 2 * step_r >= max_r:
+                    spp = p
+                    isSppFound = True
+            if not isEppFound:
+                line_img = np.zeros(word_label.shape, dtype=np.uint8)
+                dy = grad_e * dx
+                p = np.array(new_pp[-1]) + np.array([dx, dy, dx, dy])
+                cv2.line(line_img, (int(p[0]), int(p[1])), (int(p[2]), int(p[3])), 1, thickness=1)
+                if np.sum(np.logical_and(word_label, line_img)) == 0 or r + 2 * step_r >= max_r:
+                    epp = p
+                    isEppFound = True
+            if isSppFound and isEppFound:
+                break
+
+        # pass if boundary of polygon is not found
+        if not (isSppFound and isEppFound):
+            polys.append(None); continue
+
+        # make final polygon
+        poly = []
+        poly.append(warpCoord(Minv, (spp[0], spp[1])))
+        for p in new_pp:
+            poly.append(warpCoord(Minv, (p[0], p[1])))
+        poly.append(warpCoord(Minv, (epp[0], epp[1])))
+        poly.append(warpCoord(Minv, (epp[2], epp[3])))
+        for p in reversed(new_pp):
+            poly.append(warpCoord(Minv, (p[2], p[3])))
+        poly.append(warpCoord(Minv, (spp[2], spp[3])))
+
+        # add to final result
+        polys.append(np.array(poly))
+
+    return polys
+
+def getDetBoxes(textmap, linkmap, text_threshold, link_threshold, low_text, poly=False, estimate_num_chars=False):
+    if poly and estimate_num_chars:
+        raise Exception("Estimating the number of characters not currently supported with poly.")
+    boxes, labels, mapper = getDetBoxes_core(textmap, linkmap, text_threshold, link_threshold, low_text, estimate_num_chars)
+
+    if poly:
+        polys = getPoly_core(boxes, labels, mapper, linkmap)
+    else:
+        polys = [None] * len(boxes)
+
+    return boxes, polys, mapper
+
+def adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net = 2):
+    if len(polys) > 0:
+        polys = np.array(polys)
+        for k in range(len(polys)):
+            if polys[k] is not None:
+                polys[k] *= (ratio_w * ratio_net, ratio_h * ratio_net)
+    return polys
+
+
+def normalizeMeanVariance(in_img, mean=(0.485, 0.456, 0.406), variance=(0.229, 0.224, 0.225)):
+    # should be RGB order
+    img = in_img.copy().astype(np.float32)
+
+    img -= np.array([mean[0] * 255.0, mean[1] * 255.0, mean[2] * 255.0], dtype=np.float32)
+    img /= np.array([variance[0] * 255.0, variance[1] * 255.0, variance[2] * 255.0], dtype=np.float32)
+    return img
+
+
+def group_text_box(polys, slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5, width_ths = 1.0, add_margin = 0.05, sort_output = True):
+    # poly top-left, top-right, low-right, low-left
+    horizontal_list, free_list,combined_list, merged_list = [],[],[],[]
+
+    for poly in polys:
+        slope_up = (poly[3]-poly[1])/np.maximum(10, (poly[2]-poly[0]))
+        slope_down = (poly[5]-poly[7])/np.maximum(10, (poly[4]-poly[6]))
+        if max(abs(slope_up), abs(slope_down)) < slope_ths:
+            x_max = max([poly[0],poly[2],poly[4],poly[6]])
+            x_min = min([poly[0],poly[2],poly[4],poly[6]])
+            y_max = max([poly[1],poly[3],poly[5],poly[7]])
+            y_min = min([poly[1],poly[3],poly[5],poly[7]])
+            horizontal_list.append([x_min, x_max, y_min, y_max, 0.5*(y_min+y_max), y_max-y_min])
+        else:
+            height = np.linalg.norm([poly[6]-poly[0],poly[7]-poly[1]])
+            width = np.linalg.norm([poly[2]-poly[0],poly[3]-poly[1]])
+
+            margin = int(1.44*add_margin*min(width, height))
+
+            theta13 = abs(np.arctan( (poly[1]-poly[5])/np.maximum(10, (poly[0]-poly[4]))))
+            theta24 = abs(np.arctan( (poly[3]-poly[7])/np.maximum(10, (poly[2]-poly[6]))))
+            # do I need to clip minimum, maximum value here?
+            x1 = poly[0] - np.cos(theta13)*margin
+            y1 = poly[1] - np.sin(theta13)*margin
+            x2 = poly[2] + np.cos(theta24)*margin
+            y2 = poly[3] - np.sin(theta24)*margin
+            x3 = poly[4] + np.cos(theta13)*margin
+            y3 = poly[5] + np.sin(theta13)*margin
+            x4 = poly[6] - np.cos(theta24)*margin
+            y4 = poly[7] + np.sin(theta24)*margin
+
+            free_list.append([[x1,y1],[x2,y2],[x3,y3],[x4,y4]])
+    if sort_output:
+        horizontal_list = sorted(horizontal_list, key=lambda item: item[4])
+
+    # combine box
+    new_box = []
+    for poly in horizontal_list:
+
+        if len(new_box) == 0:
+            b_height = [poly[5]]
+            b_ycenter = [poly[4]]
+            new_box.append(poly)
+        else:
+            # comparable height and comparable y_center level up to ths*height
+            if abs(np.mean(b_ycenter) - poly[4]) < ycenter_ths*np.mean(b_height):
+                b_height.append(poly[5])
+                b_ycenter.append(poly[4])
+                new_box.append(poly)
+            else:
+                b_height = [poly[5]]
+                b_ycenter = [poly[4]]
+                combined_list.append(new_box)
+                new_box = [poly]
+    combined_list.append(new_box)
+
+    # merge list use sort again
+    for boxes in combined_list:
+        if len(boxes) == 1: # one box per line
+            box = boxes[0]
+            margin = int(add_margin*min(box[1]-box[0],box[5]))
+            merged_list.append([box[0]-margin,box[1]+margin,box[2]-margin,box[3]+margin])
+        else: # multiple boxes per line
+            boxes = sorted(boxes, key=lambda item: item[0])
+
+            merged_box, new_box = [],[]
+            for box in boxes:
+                if len(new_box) == 0:
+                    b_height = [box[5]]
+                    x_max = box[1]
+                    new_box.append(box)
+                else:
+                    if (abs(np.mean(b_height) - box[5]) < height_ths*np.mean(b_height)) and ((box[0]-x_max) < width_ths *(box[3]-box[2])): # merge boxes
+                        b_height.append(box[5])
+                        x_max = box[1]
+                        new_box.append(box)
+                    else:
+                        b_height = [box[5]]
+                        x_max = box[1]
+                        merged_box.append(new_box)
+                        new_box = [box]
+            if len(new_box) >0: merged_box.append(new_box)
+
+            for mbox in merged_box:
+                if len(mbox) != 1: # adjacent box in same line
+                    # do I need to add margin here?
+                    x_min = min(mbox, key=lambda x: x[0])[0]
+                    x_max = max(mbox, key=lambda x: x[1])[1]
+                    y_min = min(mbox, key=lambda x: x[2])[2]
+                    y_max = max(mbox, key=lambda x: x[3])[3]
+
+                    box_width = x_max - x_min
+                    box_height = y_max - y_min
+                    margin = int(add_margin * (min(box_width, box_height)))
+
+                    merged_list.append([x_min-margin, x_max+margin, y_min-margin, y_max+margin])
+                else: # non adjacent box in same line
+                    box = mbox[0]
+
+                    box_width = box[1] - box[0]
+                    box_height = box[3] - box[2]
+                    margin = int(add_margin * (min(box_width, box_height)))
+
+                    merged_list.append([box[0]-margin,box[1]+margin,box[2]-margin,box[3]+margin])
+    # may need to check if box is really in image
+    return merged_list, free_list
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py
index bfcf79529..a440b9c66 100644
--- a/examples/pipelines/datacomp/components/detect_text/src/main.py
+++ b/examples/pipelines/datacomp/components/detect_text/src/main.py
@@ -9,9 +9,7 @@

 from huggingface_hub import hf_hub_download

-from easyocr.craft_utils import getDetBoxes, adjustResultCoordinates
-from easyocr.imgproc import normalizeMeanVariance
-from easyocr.utils import group_text_box
+from easyocr_utils import getDetBoxes, adjustResultCoordinates, normalizeMeanVariance, group_text_box

 import torch
 import onnxruntime as ort
@@ -58,7 +56,7 @@ def get_boxes(image_data, session):
         image = Image.open(io.BytesIO(image_data)).convert("RGB")
         image = np.array(image)
     except:
-        return [None]
+        return []

     # Use Pillow instead of cv2
     img_resized, target_ratio, size_heatmap = resize_aspect_ratio_pillow(img=image,
diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile
index c5c2a7767..5ff146228 100644
--- a/examples/pipelines/datacomp/components/download_images/Dockerfile
+++ b/examples/pipelines/datacomp/components/download_images/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=7ec067b0d610d9658ba2dffe0ac33475926840bb
+ARG FONDANT_VERSION=3d433d9e8dfeba967236445657dd8f415726de9a
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
index e988ebe46..9f4e2b3df 100644
--- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Download images
 description: Component that downloads images based on URLs
-image: ghcr.io/ml6team/download_images:7ec067b0d610d9658ba2dffe0ac33475926840bb
+image: ghcr.io/ml6team/download_images:3d433d9e8dfeba967236445657dd8f415726de9a

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index dfe6f9f4f..b2b609961 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -15,8 +15,8 @@
 pipeline = Pipeline(
     pipeline_name="datacomp-filtering-pipeline",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
-    base_path=PipelineConfigs.BASE_PATH,
-    # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
+    # base_path=PipelineConfigs.BASE_PATH,
+    base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
 )
 client = Client(host=PipelineConfigs.HOST)
@@ -90,7 +90,7 @@
 # pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 pipeline.add_op(download_images_op, dependencies=load_from_hub_op)
-pipeline.add_op(detect_text_op, dependencies=download_images_op)
+# pipeline.add_op(detect_text_op, dependencies=download_images_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

From fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Sun, 6 Aug 2023 09:52:39 +0200
Subject: [PATCH 60/65] More improvements

---
 .../components/detect_text/Dockerfile              |   3 +-
 .../detect_text/fondant_component.yaml             |   2 +-
 .../components/detect_text/requirements.txt        |   1 -
 .../components/detect_text/src/main.py             |   2 +-
 .../components/detect_text_gpu/Dockerfile          |  23 ++
 .../detect_text_gpu/fondant_component.yaml         |  21 +
 .../detect_text_gpu/requirements.txt               |   4 +
 .../detect_text_gpu/src/easyocr_utils.py           | 366 ++++++++++++++++++
 .../components/detect_text_gpu/src/main.py         | 140 +++++++
 .../detect_text_torch_gpu/Dockerfile               |  23 ++
 .../fondant_component.yaml                         |  21 +
 .../detect_text_torch_gpu/requirements.txt         |   4 +
 .../detect_text_torch_gpu/src/main.py              | 142 +++++++
 .../download_images/fondant_component.yaml         |   2 +-
 .../components/download_images/src/main.py         |   1 +
 .../datacomp/components/dummy/Dockerfile           |  23 ++
 .../components/dummy/fondant_component.yaml        |   9 +
 .../components/dummy/requirements.txt              |   0
 .../datacomp/components/dummy/src/main.py          |  26 ++
 examples/pipelines/datacomp/pipeline.py            |  31 +-
 20 files changed, 826 insertions(+), 18 deletions(-)
 create mode 100644 examples/pipelines/datacomp/components/detect_text_gpu/Dockerfile
 create mode 100644 examples/pipelines/datacomp/components/detect_text_gpu/fondant_component.yaml
 create mode 100644 examples/pipelines/datacomp/components/detect_text_gpu/requirements.txt
 create mode 100644 examples/pipelines/datacomp/components/detect_text_gpu/src/easyocr_utils.py
 create mode 100644 examples/pipelines/datacomp/components/detect_text_gpu/src/main.py
 create mode 100644 examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile
 create mode 100644 examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
 create mode 100644 examples/pipelines/datacomp/components/detect_text_torch_gpu/requirements.txt
 create mode 100644 examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py
 create mode 100644 examples/pipelines/datacomp/components/dummy/Dockerfile
 create mode 100644 examples/pipelines/datacomp/components/dummy/fondant_component.yaml
 create mode 100644 examples/pipelines/datacomp/components/dummy/requirements.txt
 create mode 100644 examples/pipelines/datacomp/components/dummy/src/main.py

diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile
index b52d9a008..ad09d730e 100644
--- a/examples/pipelines/datacomp/components/detect_text/Dockerfile
+++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile
@@ -8,10 +8,11 @@ RUN apt-get update && \
 # Install requirements
 COPY requirements.txt ./
 RUN pip3 install --no-cache-dir -r requirements.txt
+RUN pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=dada0106b4a39ad53025704541231d350ee262a2
+ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
index 7a6579aa0..357feb55f 100644
--- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Detect text
 description: Component that detects text in images
-image: ghcr.io/ml6team/detect_text:3d433d9e8dfeba967236445657dd8f415726de9a
+image: ghcr.io/ml6team/detect_text:cc418db0964ee202ebf2d4fd3aaee15604604244

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/detect_text/requirements.txt b/examples/pipelines/datacomp/components/detect_text/requirements.txt
index 62fc9dd06..823417f9a 100644
--- a/examples/pipelines/datacomp/components/detect_text/requirements.txt
+++ b/examples/pipelines/datacomp/components/detect_text/requirements.txt
@@ -1,5 +1,4 @@
 huggingface-hub==0.16.4
 onnxruntime==1.15.1
-torch==2.0.1
 opencv-python-headless
 scipy
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py
index a440b9c66..709017ffb 100644
--- a/examples/pipelines/datacomp/components/detect_text/src/main.py
+++ b/examples/pipelines/datacomp/components/detect_text/src/main.py
@@ -34,7 +34,7 @@ def resize_aspect_ratio_pillow(img, square_size, mag_ratio=1):
     target_h, target_w = int(height * ratio), int(width * ratio)

     img = Image.fromarray(img)
-    proc = img.resize((target_w, target_h), resample = Image.BILINEAR)
+    proc = img.resize((target_w, target_h), resample = Image.Resampling.BILINEAR)

     # make canvas and paste image
     target_h32, target_w32 = target_h, target_w
diff --git a/examples/pipelines/datacomp/components/detect_text_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_gpu/Dockerfile
new file mode 100644
index 000000000..ed1861562
--- /dev/null
+++ b/examples/pipelines/datacomp/components/detect_text_gpu/Dockerfile
@@ -0,0 +1,23 @@
+FROM --platform=linux/amd64 pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
+
+# System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# Install requirements
+COPY requirements.txt ./
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Install Fondant
+# This is split from other requirements to leverage caching
+ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244
+RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files
+COPY src/ .
+ +ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_gpu/fondant_component.yaml new file mode 100644 index 000000000..9dfb4d69a --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text_gpu/fondant_component.yaml @@ -0,0 +1,21 @@ +name: Detect text +description: Component that detects text in images +image: ghcr.io/ml6team/detect_text_gpu:cc418db0964ee202ebf2d4fd3aaee15604604244 + +consumes: + image: + fields: + data: + type: binary + +produces: + image: + fields: + data: + type: binary + boxes: + type: array + items: + type: array + items: + type: int64 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text_gpu/requirements.txt b/examples/pipelines/datacomp/components/detect_text_gpu/requirements.txt new file mode 100644 index 000000000..ee7989a30 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text_gpu/requirements.txt @@ -0,0 +1,4 @@ +huggingface-hub==0.16.4 +onnxruntime-gpu==1.15.1 +opencv-python-headless +scipy \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text_gpu/src/easyocr_utils.py b/examples/pipelines/datacomp/components/detect_text_gpu/src/easyocr_utils.py new file mode 100644 index 000000000..4feb67768 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text_gpu/src/easyocr_utils.py @@ -0,0 +1,366 @@ +""" +Copyright (c) 2019-present NAVER Corp. +MIT License +""" +import numpy as np +import cv2 +import math +from scipy.ndimage import label + +""" auxiliary functions """ +# unwarp corodinates +def warpCoord(Minv, pt): + out = np.matmul(Minv, (pt[0], pt[1], 1)) + return np.array([out[0]/out[2], out[1]/out[2]]) +""" end of auxiliary functions """ + + +def getDetBoxes_core(textmap, linkmap, text_threshold, link_threshold, low_text, estimate_num_chars=False): + # prepare data + linkmap = linkmap.copy() + textmap = textmap.copy() + img_h, img_w = textmap.shape + + """ labeling method """ + ret, text_score = cv2.threshold(textmap, low_text, 1, 0) + ret, link_score = cv2.threshold(linkmap, link_threshold, 1, 0) + + text_score_comb = np.clip(text_score + link_score, 0, 1) + nLabels, labels, stats, centroids = cv2.connectedComponentsWithStats(text_score_comb.astype(np.uint8), connectivity=4) + + det = [] + mapper = [] + for k in range(1,nLabels): + # size filtering + size = stats[k, cv2.CC_STAT_AREA] + if size < 10: continue + + # thresholding + if np.max(textmap[labels==k]) < text_threshold: continue + + # make segmentation map + segmap = np.zeros(textmap.shape, dtype=np.uint8) + segmap[labels==k] = 255 + if estimate_num_chars: + _, character_locs = cv2.threshold((textmap - linkmap) * segmap /255., text_threshold, 1, 0) + _, n_chars = label(character_locs) + mapper.append(n_chars) + else: + mapper.append(k) + segmap[np.logical_and(link_score==1, text_score==0)] = 0 # remove link area + x, y = stats[k, cv2.CC_STAT_LEFT], stats[k, cv2.CC_STAT_TOP] + w, h = stats[k, cv2.CC_STAT_WIDTH], stats[k, cv2.CC_STAT_HEIGHT] + niter = int(math.sqrt(size * min(w, h) / (w * h)) * 2) + sx, ex, sy, ey = x - niter, x + w + niter + 1, y - niter, y + h + niter + 1 + # boundary check + if sx < 0 : sx = 0 + if sy < 0 : sy = 0 + if ex >= img_w: ex = img_w + if ey >= img_h: ey = img_h + kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(1 + niter, 1 + niter)) + segmap[sy:ey, sx:ex] = cv2.dilate(segmap[sy:ey, sx:ex], kernel) + + # make box 
+ np_contours = np.roll(np.array(np.where(segmap!=0)),1,axis=0).transpose().reshape(-1,2) + rectangle = cv2.minAreaRect(np_contours) + box = cv2.boxPoints(rectangle) + + # align diamond-shape + w, h = np.linalg.norm(box[0] - box[1]), np.linalg.norm(box[1] - box[2]) + box_ratio = max(w, h) / (min(w, h) + 1e-5) + if abs(1 - box_ratio) <= 0.1: + l, r = min(np_contours[:,0]), max(np_contours[:,0]) + t, b = min(np_contours[:,1]), max(np_contours[:,1]) + box = np.array([[l, t], [r, t], [r, b], [l, b]], dtype=np.float32) + + # make clock-wise order + startidx = box.sum(axis=1).argmin() + box = np.roll(box, 4-startidx, 0) + box = np.array(box) + + det.append(box) + + return det, labels, mapper + +def getPoly_core(boxes, labels, mapper, linkmap): + # configs + num_cp = 5 + max_len_ratio = 0.7 + expand_ratio = 1.45 + max_r = 2.0 + step_r = 0.2 + + polys = [] + for k, box in enumerate(boxes): + # size filter for small instance + w, h = int(np.linalg.norm(box[0] - box[1]) + 1), int(np.linalg.norm(box[1] - box[2]) + 1) + if w < 10 or h < 10: + polys.append(None); continue + + # warp image + tar = np.float32([[0,0],[w,0],[w,h],[0,h]]) + M = cv2.getPerspectiveTransform(box, tar) + word_label = cv2.warpPerspective(labels, M, (w, h), flags=cv2.INTER_NEAREST) + try: + Minv = np.linalg.inv(M) + except: + polys.append(None); continue + + # binarization for selected label + cur_label = mapper[k] + word_label[word_label != cur_label] = 0 + word_label[word_label > 0] = 1 + + """ Polygon generation """ + # find top/bottom contours + cp = [] + max_len = -1 + for i in range(w): + region = np.where(word_label[:,i] != 0)[0] + if len(region) < 2 : continue + cp.append((i, region[0], region[-1])) + length = region[-1] - region[0] + 1 + if length > max_len: max_len = length + + # pass if max_len is similar to h + if h * max_len_ratio < max_len: + polys.append(None); continue + + # get pivot points with fixed length + tot_seg = num_cp * 2 + 1 + seg_w = w / tot_seg # segment width + pp = [None] * num_cp # init pivot points + cp_section = [[0, 0]] * tot_seg + seg_height = [0] * num_cp + seg_num = 0 + num_sec = 0 + prev_h = -1 + for i in range(0,len(cp)): + (x, sy, ey) = cp[i] + if (seg_num + 1) * seg_w <= x and seg_num <= tot_seg: + # average previous segment + if num_sec == 0: break + cp_section[seg_num] = [cp_section[seg_num][0] / num_sec, cp_section[seg_num][1] / num_sec] + num_sec = 0 + + # reset variables + seg_num += 1 + prev_h = -1 + + # accumulate center points + cy = (sy + ey) * 0.5 + cur_h = ey - sy + 1 + cp_section[seg_num] = [cp_section[seg_num][0] + x, cp_section[seg_num][1] + cy] + num_sec += 1 + + if seg_num % 2 == 0: continue # No polygon area + + if prev_h < cur_h: + pp[int((seg_num - 1)/2)] = (x, cy) + seg_height[int((seg_num - 1)/2)] = cur_h + prev_h = cur_h + + # processing last segment + if num_sec != 0: + cp_section[-1] = [cp_section[-1][0] / num_sec, cp_section[-1][1] / num_sec] + + # pass if num of pivots is not sufficient or segment width is smaller than character height + if None in pp or seg_w < np.max(seg_height) * 0.25: + polys.append(None); continue + + # calc median maximum of pivot points + half_char_h = np.median(seg_height) * expand_ratio / 2 + + # calc gradiant and apply to make horizontal pivots + new_pp = [] + for i, (x, cy) in enumerate(pp): + dx = cp_section[i * 2 + 2][0] - cp_section[i * 2][0] + dy = cp_section[i * 2 + 2][1] - cp_section[i * 2][1] + if dx == 0: # gradient if zero + new_pp.append([x, cy - half_char_h, x, cy + half_char_h]) + continue + rad = - math.atan2(dy, dx) + c, 
s = half_char_h * math.cos(rad), half_char_h * math.sin(rad) + new_pp.append([x - s, cy - c, x + s, cy + c]) + + # get edge points to cover character heatmaps + isSppFound, isEppFound = False, False + grad_s = (pp[1][1] - pp[0][1]) / (pp[1][0] - pp[0][0]) + (pp[2][1] - pp[1][1]) / (pp[2][0] - pp[1][0]) + grad_e = (pp[-2][1] - pp[-1][1]) / (pp[-2][0] - pp[-1][0]) + (pp[-3][1] - pp[-2][1]) / (pp[-3][0] - pp[-2][0]) + for r in np.arange(0.5, max_r, step_r): + dx = 2 * half_char_h * r + if not isSppFound: + line_img = np.zeros(word_label.shape, dtype=np.uint8) + dy = grad_s * dx + p = np.array(new_pp[0]) - np.array([dx, dy, dx, dy]) + cv2.line(line_img, (int(p[0]), int(p[1])), (int(p[2]), int(p[3])), 1, thickness=1) + if np.sum(np.logical_and(word_label, line_img)) == 0 or r + 2 * step_r >= max_r: + spp = p + isSppFound = True + if not isEppFound: + line_img = np.zeros(word_label.shape, dtype=np.uint8) + dy = grad_e * dx + p = np.array(new_pp[-1]) + np.array([dx, dy, dx, dy]) + cv2.line(line_img, (int(p[0]), int(p[1])), (int(p[2]), int(p[3])), 1, thickness=1) + if np.sum(np.logical_and(word_label, line_img)) == 0 or r + 2 * step_r >= max_r: + epp = p + isEppFound = True + if isSppFound and isEppFound: + break + + # pass if boundary of polygon is not found + if not (isSppFound and isEppFound): + polys.append(None); continue + + # make final polygon + poly = [] + poly.append(warpCoord(Minv, (spp[0], spp[1]))) + for p in new_pp: + poly.append(warpCoord(Minv, (p[0], p[1]))) + poly.append(warpCoord(Minv, (epp[0], epp[1]))) + poly.append(warpCoord(Minv, (epp[2], epp[3]))) + for p in reversed(new_pp): + poly.append(warpCoord(Minv, (p[2], p[3]))) + poly.append(warpCoord(Minv, (spp[2], spp[3]))) + + # add to final result + polys.append(np.array(poly)) + + return polys + +def getDetBoxes(textmap, linkmap, text_threshold, link_threshold, low_text, poly=False, estimate_num_chars=False): + if poly and estimate_num_chars: + raise Exception("Estimating the number of characters not currently supported with poly.") + boxes, labels, mapper = getDetBoxes_core(textmap, linkmap, text_threshold, link_threshold, low_text, estimate_num_chars) + + if poly: + polys = getPoly_core(boxes, labels, mapper, linkmap) + else: + polys = [None] * len(boxes) + + return boxes, polys, mapper + +def adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net = 2): + if len(polys) > 0: + polys = np.array(polys) + for k in range(len(polys)): + if polys[k] is not None: + polys[k] *= (ratio_w * ratio_net, ratio_h * ratio_net) + return polys + + +def normalizeMeanVariance(in_img, mean=(0.485, 0.456, 0.406), variance=(0.229, 0.224, 0.225)): + # should be RGB order + img = in_img.copy().astype(np.float32) + + img -= np.array([mean[0] * 255.0, mean[1] * 255.0, mean[2] * 255.0], dtype=np.float32) + img /= np.array([variance[0] * 255.0, variance[1] * 255.0, variance[2] * 255.0], dtype=np.float32) + return img + + +def group_text_box(polys, slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5, width_ths = 1.0, add_margin = 0.05, sort_output = True): + # poly top-left, top-right, low-right, low-left + horizontal_list, free_list,combined_list, merged_list = [],[],[],[] + + for poly in polys: + slope_up = (poly[3]-poly[1])/np.maximum(10, (poly[2]-poly[0])) + slope_down = (poly[5]-poly[7])/np.maximum(10, (poly[4]-poly[6])) + if max(abs(slope_up), abs(slope_down)) < slope_ths: + x_max = max([poly[0],poly[2],poly[4],poly[6]]) + x_min = min([poly[0],poly[2],poly[4],poly[6]]) + y_max = max([poly[1],poly[3],poly[5],poly[7]]) + y_min = 
min([poly[1],poly[3],poly[5],poly[7]])
+            horizontal_list.append([x_min, x_max, y_min, y_max, 0.5*(y_min+y_max), y_max-y_min])
+        else:
+            height = np.linalg.norm([poly[6]-poly[0],poly[7]-poly[1]])
+            width = np.linalg.norm([poly[2]-poly[0],poly[3]-poly[1]])
+
+            margin = int(1.44*add_margin*min(width, height))
+
+            theta13 = abs(np.arctan( (poly[1]-poly[5])/np.maximum(10, (poly[0]-poly[4]))))
+            theta24 = abs(np.arctan( (poly[3]-poly[7])/np.maximum(10, (poly[2]-poly[6]))))
+            # do I need to clip minimum, maximum value here?
+            x1 = poly[0] - np.cos(theta13)*margin
+            y1 = poly[1] - np.sin(theta13)*margin
+            x2 = poly[2] + np.cos(theta24)*margin
+            y2 = poly[3] - np.sin(theta24)*margin
+            x3 = poly[4] + np.cos(theta13)*margin
+            y3 = poly[5] + np.sin(theta13)*margin
+            x4 = poly[6] - np.cos(theta24)*margin
+            y4 = poly[7] + np.sin(theta24)*margin
+
+            free_list.append([[x1,y1],[x2,y2],[x3,y3],[x4,y4]])
+    if sort_output:
+        horizontal_list = sorted(horizontal_list, key=lambda item: item[4])
+
+    # combine box
+    new_box = []
+    for poly in horizontal_list:
+
+        if len(new_box) == 0:
+            b_height = [poly[5]]
+            b_ycenter = [poly[4]]
+            new_box.append(poly)
+        else:
+            # comparable height and comparable y_center level up to ths*height
+            if abs(np.mean(b_ycenter) - poly[4]) < ycenter_ths*np.mean(b_height):
+                b_height.append(poly[5])
+                b_ycenter.append(poly[4])
+                new_box.append(poly)
+            else:
+                b_height = [poly[5]]
+                b_ycenter = [poly[4]]
+                combined_list.append(new_box)
+                new_box = [poly]
+    combined_list.append(new_box)
+
+    # merge boxes within each line; sort again by x-coordinate
+    for boxes in combined_list:
+        if len(boxes) == 1: # one box per line
+            box = boxes[0]
+            margin = int(add_margin*min(box[1]-box[0],box[5]))
+            merged_list.append([box[0]-margin,box[1]+margin,box[2]-margin,box[3]+margin])
+        else: # multiple boxes per line
+            boxes = sorted(boxes, key=lambda item: item[0])
+
+            merged_box, new_box = [],[]
+            for box in boxes:
+                if len(new_box) == 0:
+                    b_height = [box[5]]
+                    x_max = box[1]
+                    new_box.append(box)
+                else:
+                    if (abs(np.mean(b_height) - box[5]) < height_ths*np.mean(b_height)) and ((box[0]-x_max) < width_ths *(box[3]-box[2])): # merge boxes
+                        b_height.append(box[5])
+                        x_max = box[1]
+                        new_box.append(box)
+                    else:
+                        b_height = [box[5]]
+                        x_max = box[1]
+                        merged_box.append(new_box)
+                        new_box = [box]
+            if len(new_box) > 0: merged_box.append(new_box)
+
+            for mbox in merged_box:
+                if len(mbox) != 1: # adjacent boxes in same line
+                    # do I need to add margin here?
+                    x_min = min(mbox, key=lambda x: x[0])[0]
+                    x_max = max(mbox, key=lambda x: x[1])[1]
+                    y_min = min(mbox, key=lambda x: x[2])[2]
+                    y_max = max(mbox, key=lambda x: x[3])[3]
+
+                    box_width = x_max - x_min
+                    box_height = y_max - y_min
+                    margin = int(add_margin * (min(box_width, box_height)))
+
+                    merged_list.append([x_min-margin, x_max+margin, y_min-margin, y_max+margin])
+                else: # non-adjacent box in same line
+                    box = mbox[0]
+
+                    box_width = box[1] - box[0]
+                    box_height = box[3] - box[2]
+                    margin = int(add_margin * (min(box_width, box_height)))
+
+                    merged_list.append([box[0]-margin,box[1]+margin,box[2]-margin,box[3]+margin])
+    # may need to check if box is really in image
+    return merged_list, free_list
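+
+# Example usage (a minimal sketch): these helpers are chained exactly as in
+# this component's `get_boxes`, turning the two CRAFT score maps into grouped
+# horizontal boxes. `score_text`, `score_link`, `ratio_w` and `ratio_h` are
+# assumed to come from the model output and the resize step:
+#
+#   boxes, _, _ = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4)
+#   boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h)
+#   polys = [np.array(box).astype(np.int32).reshape(-1) for box in boxes]
+#   horizontal_list, free_list = group_text_box(polys)
\ No newline at end of file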
+""" +import logging + +import dask.dataframe as dd +import numpy as np +import io +from PIL import Image + +from huggingface_hub import hf_hub_download + +from easyocr_utils import getDetBoxes, adjustResultCoordinates, normalizeMeanVariance, group_text_box + +import torch +import onnxruntime as ort + +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor + +logger = logging.getLogger(__name__) + + +def resize_aspect_ratio_pillow(img, square_size, mag_ratio=1): + height, width, channel = img.shape + + # magnify image size + target_size = mag_ratio * max(height, width) + + # set original image size + if target_size > square_size: + target_size = square_size + + ratio = target_size / max(height, width) + + target_h, target_w = int(height * ratio), int(width * ratio) + img = Image.fromarray(img) + proc = img.resize((target_w, target_h), resample = Image.Resampling.BILINEAR) + + # make canvas and paste image + target_h32, target_w32 = target_h, target_w + if target_h % 32 != 0: + target_h32 = target_h + (32 - target_h % 32) + if target_w % 32 != 0: + target_w32 = target_w + (32 - target_w % 32) + resized = np.zeros((target_h32, target_w32, channel), dtype=np.float32) + resized[0:target_h, 0:target_w, :] = proc + target_h, target_w = target_h32, target_w32 + + size_heatmap = (int(target_w/2), int(target_h/2)) + + return resized, ratio, size_heatmap + + +def get_boxes(image_data, session): + try: + image = Image.open(io.BytesIO(image_data)).convert("RGB") + image = np.array(image) + except: + return [] + + # Use Pillow instead of cv2 + img_resized, target_ratio, size_heatmap = resize_aspect_ratio_pillow(img=image, + square_size=512, + mag_ratio=1.0) + + ratio_h = ratio_w = 1 / target_ratio + x = normalizeMeanVariance(img_resized) + x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0) + + input_name = session.get_inputs()[0].name + + # Prepare input tensor for inference + inp = {input_name: x.numpy()} + + # Run inference and get output + y, _ = session.run(None, inp) + + # Extract score and link maps + score_text = y[0, :, :, 0] + score_link = y[0, :, :, 1] + + # Post-processing to obtain bounding boxes and polygons + boxes, _, _ = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4) + boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h) + + # Create horizontal reading list + polys = [] + for box in boxes: + poly = np.array(box).astype(np.int32).reshape((-1)) + polys.append(poly) + + horizontal_list, _ = group_text_box(polys) + + return horizontal_list + + +def get_boxes_dataframe(df, session): + # process a single partition + # TODO make column name more flexible + df["image_boxes"] = df.image_data.apply(lambda x: + get_boxes( + image_data=x, session=session, + ), + ) + + return df + + +class DetextTextComponent(DaskTransformComponent): + """Component that detexts text in images, using the CRAFT model. 
+ """ + + def __init__(self, *args) -> None: + + craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") + logger.info(f"Device: {ort.get_device()}") + providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] + self.session = ort.InferenceSession(craft_onnx, providers=providers) + + def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + + # create meta + # needs to be a dictionary with keys = column names, values = dtypes of columns + # for each column in the output + meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} + meta["image_boxes"] = np.dtype(object) + + logger.info("Detecting texts..") + dataframe = dataframe.map_partitions( + get_boxes_dataframe, + session=self.session, + meta=meta, + ) + + return dataframe + + +if __name__ == "__main__": + executor = DaskTransformExecutor.from_args() + executor.execute(DetextTextComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile new file mode 100644 index 000000000..ed1861562 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile @@ -0,0 +1,23 @@ +FROM --platform=linux/amd64 pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + +# System dependencies +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install git -y + +# Install requirements +COPY requirements.txt ./ +RUN pip3 install --no-cache-dir -r requirements.txt + +# Install Fondant +# This is split from other requirements to leverage caching +ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244 +RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} + +# Set the working directory to the component folder +WORKDIR /component/src + +# Copy over src-files +COPY src/ . 
+
+ENTRYPOINT ["python", "main.py"]
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
new file mode 100644
index 000000000..fda4b0843
--- /dev/null
+++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
@@ -0,0 +1,21 @@
+name: Detect text
+description: Component that detects text in images
+image: ghcr.io/ml6team/detect_text_torch_gpu:cc418db0964ee202ebf2d4fd3aaee15604604244
+
+consumes:
+  image:
+    fields:
+      data:
+        type: binary
+
+produces:
+  image:
+    fields:
+      data:
+        type: binary
+      boxes:
+        type: array
+        items:
+          type: array
+          items:
+            type: int64
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/requirements.txt b/examples/pipelines/datacomp/components/detect_text_torch_gpu/requirements.txt
new file mode 100644
index 000000000..e6bf68322
--- /dev/null
+++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/requirements.txt
@@ -0,0 +1,4 @@
+huggingface-hub==0.16.4
+opencv-python-headless
+scipy
+easyocr==1.7.0
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py
new file mode 100644
index 000000000..c67e0d275
--- /dev/null
+++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py
@@ -0,0 +1,142 @@
+"""This component detects text in images, using CRAFT.
+"""
+import logging
+
+import dask.dataframe as dd
+import numpy as np
+import io
+from PIL import Image
+
+from huggingface_hub import hf_hub_download
+
+from easyocr.craft_utils import getDetBoxes, adjustResultCoordinates
+from easyocr.detection import get_detector
+from easyocr.imgproc import normalizeMeanVariance
+from easyocr.utils import group_text_box
+
+import torch
+
+from fondant.component import DaskTransformComponent
+from fondant.executor import DaskTransformExecutor
+
+logger = logging.getLogger(__name__)
+
+
+def resize_aspect_ratio_pillow(img, square_size, mag_ratio=1):
+    height, width, channel = img.shape
+
+    # magnify image size
+    target_size = mag_ratio * max(height, width)
+
+    # set original image size
+    if target_size > square_size:
+        target_size = square_size
+
+    ratio = target_size / max(height, width)
+
+    target_h, target_w = int(height * ratio), int(width * ratio)
+    img = Image.fromarray(img)
+    proc = img.resize((target_w, target_h), resample=Image.Resampling.BILINEAR)
+
+    # make canvas and paste image
+    target_h32, target_w32 = target_h, target_w
+    if target_h % 32 != 0:
+        target_h32 = target_h + (32 - target_h % 32)
+    if target_w % 32 != 0:
+        target_w32 = target_w + (32 - target_w % 32)
+    resized = np.zeros((target_h32, target_w32, channel), dtype=np.float32)
+    resized[0:target_h, 0:target_w, :] = proc
+    target_h, target_w = target_h32, target_w32
+
+    size_heatmap = (int(target_w/2), int(target_h/2))
+
+    return resized, ratio, size_heatmap
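+
+# Worked example (illustrative numbers, not from the dataset): a 600x400 RGB
+# image with square_size=512 and mag_ratio=1.0 gives target_size=512 and
+# ratio=512/600≈0.853, so the image is resized to 512x341 and pasted on a
+# 512x352 canvas (dimensions are rounded up to a multiple of 32);
+# size_heatmap is then (256, 176), and callers invert `ratio` to map CRAFT
+# output coordinates back to the original image.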
+
+
+def get_boxes(image_data, net):
+    try:
+        image = Image.open(io.BytesIO(image_data)).convert("RGB")
+        image = np.array(image)
+    except Exception:  # skip images that cannot be decoded
+        return []
+
+    # Use Pillow instead of cv2
+    img_resized, target_ratio, size_heatmap = resize_aspect_ratio_pillow(img=image,
+                                                                         square_size=512,
+                                                                         mag_ratio=1.0)
+
+    ratio_h = ratio_w = 1 / target_ratio
+    x = normalizeMeanVariance(img_resized)
+    x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0)
+
+    # Run inference and get output
+    x = x.to(net.device)
+
+    # forward pass
+    with torch.no_grad():
+        y, feature = net(x)
+
+    # Extract score and link maps (move to CPU before converting to numpy)
+    score_text = y[0, :, :, 0].cpu().numpy()
+    score_link = y[0, :, :, 1].cpu().numpy()
+
+    # Post-processing to obtain bounding boxes and polygons
+    boxes, _, _ = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4)
+    boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h)
+
+    # Create horizontal reading list
+    polys = []
+    for box in boxes:
+        poly = np.array(box).astype(np.int32).reshape((-1))
+        polys.append(poly)
+
+    horizontal_list, _ = group_text_box(polys)
+
+    return horizontal_list
+
+
+def get_boxes_dataframe(df, net):
+    # process a single partition
+    # TODO make column name more flexible
+    df["image_boxes"] = df.image_data.apply(lambda x:
+        get_boxes(
+            image_data=x, net=net,
+        ),
+    )
+
+    return df
+
+
+class DetextTextComponent(DaskTransformComponent):
+    """Component that detects text in images, using the CRAFT model.
+    """
+
+    def __init__(self, *args) -> None:
+
+        filepath = hf_hub_download(repo_id="nielsr/craft-pytorch", filename="net.pth", repo_type="model")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"Device: {device}")
+        self.net = get_detector(filepath, device=device)
+
+    def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:
+
+        # create meta
+        # needs to be a dictionary with keys = column names, values = dtypes of columns
+        # for each column in the output
+        meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)}
+        meta["image_data"] = bytes
+        meta["image_boxes"] = np.dtype(object)
+
+        logger.info("Detecting texts..")
+        dataframe = dataframe.map_partitions(
+            get_boxes_dataframe,
+            net=self.net,
+            meta=meta,
+        )
+
+        return dataframe
+
+
+if __name__ == "__main__":
+    executor = DaskTransformExecutor.from_args()
+    executor.execute(DetextTextComponent)
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
index 9f4e2b3df..5e5965144 100644
--- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Download images
 description: Component that downloads images based on URLs
-image: ghcr.io/ml6team/download_images:3d433d9e8dfeba967236445657dd8f415726de9a
+image: ghcr.io/ml6team/download_images:cc418db0964ee202ebf2d4fd3aaee15604604244
 
 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py
index 10932daa5..a7492f4c2 100644
--- a/examples/pipelines/datacomp/components/download_images/src/main.py
+++ b/examples/pipelines/datacomp/components/download_images/src/main.py
@@ -172,6 +172,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:
         dataframe = dataframe.dropna()
 
         logger.info(f"Length of the final dataframe: {len(dataframe)}")
+        print("Length of final dataframe:", len(dataframe))
 
         return dataframe
 
diff --git a/examples/pipelines/datacomp/components/dummy/Dockerfile b/examples/pipelines/datacomp/components/dummy/Dockerfile
new file mode 100644
index 000000000..2378703ea
--- /dev/null
+++ b/examples/pipelines/datacomp/components/dummy/Dockerfile
@@ -0,0 +1,23 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+# System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# Install requirements
Install requirements +COPY requirements.txt / +RUN pip3 install --no-cache-dir -r requirements.txt + +# Install Fondant +# This is split from other requirements to leverage caching +ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244 +RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} + +# Set the working directory to the component folder +WORKDIR /component/src + +# Copy over src-files +COPY src/ . + +ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/dummy/fondant_component.yaml b/examples/pipelines/datacomp/components/dummy/fondant_component.yaml new file mode 100644 index 000000000..346f3aa1b --- /dev/null +++ b/examples/pipelines/datacomp/components/dummy/fondant_component.yaml @@ -0,0 +1,9 @@ +name: Dummy component +description: Component that downloads images based on URLs +image: ghcr.io/ml6team/dummy:cc418db0964ee202ebf2d4fd3aaee15604604244 + +consumes: + image: + fields: + data: + type: binary \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/dummy/requirements.txt b/examples/pipelines/datacomp/components/dummy/requirements.txt new file mode 100644 index 000000000..e69de29bb diff --git a/examples/pipelines/datacomp/components/dummy/src/main.py b/examples/pipelines/datacomp/components/dummy/src/main.py new file mode 100644 index 000000000..5700c1936 --- /dev/null +++ b/examples/pipelines/datacomp/components/dummy/src/main.py @@ -0,0 +1,26 @@ +""" +Dummy component for debugging. +""" +import logging + +import dask.dataframe as dd + +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor + +logger = logging.getLogger(__name__) + + +class DummyComponent(DaskTransformComponent): + """Component that downloads images based on URLs.""" + + def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + + logger.info(f"Length of the dataframe: {len(dataframe)}") + + return dataframe + + +if __name__ == "__main__": + executor = DaskTransformExecutor.from_args() + executor.execute(DummyComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index b2b609961..ab25454b6 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -63,13 +63,6 @@ "min_num_actions": 1, }, ) -cluster_image_embeddings_op = ComponentOp( - component_dir="components/cluster_image_embeddings", - arguments={ - "sample_ratio": 0.3, - "num_clusters": 3, - }, -) download_images_op = ComponentOp( component_dir="components/download_images", node_pool_label="node_pool", @@ -83,15 +76,27 @@ node_pool_name="model-inference-pool", output_partition_size="disable", ) +# dummpy_op = ComponentOp( +# component_dir="components/dummy", +# ) +detect_text_gpu_op = ComponentOp( + component_dir="components/detect_text_torch_gpu", + number_of_gpus=1, + node_pool_label="node_pool", + node_pool_name="model-inference-pool", + output_partition_size="disable", +) +# dummpy_op = ComponentOp( +# 
component_dir="components/dummy", +# ) + + # add ops to pipeline pipeline.add_op(load_from_hub_op) -# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) -# pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) pipeline.add_op(download_images_op, dependencies=load_from_hub_op) -# pipeline.add_op(detect_text_op, dependencies=download_images_op) -# pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) +pipeline.add_op(detect_text_gpu_op, dependencies=download_images_op) # TODO add more ops client.compile_and_run(pipeline=pipeline) \ No newline at end of file From be3e4b83dcde14f6c8a1b33078c99ead1d40655e Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 6 Aug 2023 10:23:41 +0200 Subject: [PATCH 61/65] Add print statement --- .../datacomp/components/detect_text_torch_gpu/Dockerfile | 2 +- .../detect_text_torch_gpu/fondant_component.yaml | 2 +- .../datacomp/components/download_images/Dockerfile | 2 +- .../components/download_images/fondant_component.yaml | 2 +- .../datacomp/components/download_images/src/main.py | 8 ++++---- src/fondant/data_io.py | 1 + 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile index ed1861562..54211fa15 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244 +ARG FONDANT_VERSION=fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml index fda4b0843..c75c80fc3 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text_torch_gpu:cc418db0964ee202ebf2d4fd3aaee15604604244 +image: ghcr.io/ml6team/detect_text_torch_gpu:fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 consumes: image: diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile index 5ff146228..e747f39e9 100644 --- a/examples/pipelines/datacomp/components/download_images/Dockerfile +++ b/examples/pipelines/datacomp/components/download_images/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=3d433d9e8dfeba967236445657dd8f415726de9a +ARG FONDANT_VERSION=fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml 
index 5e5965144..6d700e80c 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:cc418db0964ee202ebf2d4fd3aaee15604604244 +image: ghcr.io/ml6team/download_images:fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 consumes: image: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index a7492f4c2..c21543f98 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -153,10 +153,10 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: # needs to be a dictionary with keys = column names, values = dtypes of columns # for each column in the output meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} - meta["data"] = np.dtype(bytes) - meta["width"] = np.dtype(int) - meta["height"] = np.dtype(int) - + meta["data"] = bytes + meta["width"] = int + meta["height"] = int + dataframe = dataframe.map_partitions( download_image_with_retry_partition, timeout=self.timeout, diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 4fa442747..9683e7e3d 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -272,6 +272,7 @@ def _write_subset( location = self.manifest.subsets[subset_name].location schema = {field.name: field.type.value for field in subset_spec.fields.values()} + print(f"Schema of {subset_name}:", schema) return self._create_write_task(dataframe, location=location, schema=schema) From b40c7d9b579b0fc3531c871c82f39c1a395c2ee4 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 6 Aug 2023 11:30:55 +0200 Subject: [PATCH 62/65] Add more print statements --- .../datacomp/components/detect_text_torch_gpu/Dockerfile | 2 +- .../components/detect_text_torch_gpu/fondant_component.yaml | 2 +- .../datacomp/components/detect_text_torch_gpu/src/main.py | 3 +++ .../datacomp/components/download_images/src/main.py | 4 +++- examples/pipelines/datacomp/components/dummy/Dockerfile | 2 +- .../datacomp/components/dummy/fondant_component.yaml | 2 +- examples/pipelines/datacomp/components/dummy/src/main.py | 2 ++ examples/pipelines/datacomp/pipeline.py | 6 +++--- src/fondant/data_io.py | 3 +++ 9 files changed, 18 insertions(+), 8 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile index 54211fa15..6c3c2a504 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 +ARG FONDANT_VERSION=be3e4b83dcde14f6c8a1b33078c99ead1d40655e RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml index c75c80fc3..77a2590d2 100644 --- 
a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text_torch_gpu:fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 +image: ghcr.io/ml6team/detect_text_torch_gpu:be3e4b83dcde14f6c8a1b33078c99ead1d40655e consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py index c67e0d275..ef04907d9 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py @@ -120,6 +120,9 @@ def __init__(self, *args) -> None: def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + # cast image_data to the right dtype + dataframe = dataframe.astype({'image_data': bytes}) + # create meta # needs to be a dictionary with keys = column names, values = dtypes of columns # for each column in the output diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index c21543f98..82067c56d 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -156,7 +156,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: meta["data"] = bytes meta["width"] = int meta["height"] = int - + dataframe = dataframe.map_partitions( download_image_with_retry_partition, timeout=self.timeout, @@ -166,9 +166,11 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: ) # rename new columns to be conform the spec + logger.info("Renaming columns...") dataframe = dataframe.rename(columns={"data": "image_data", "width": "image_width", "height":"image_height"}) # Remove images that could not be fetched + logger.info("Dropping invalid rows...") dataframe = dataframe.dropna() logger.info(f"Length of the final dataframe: {len(dataframe)}") diff --git a/examples/pipelines/datacomp/components/dummy/Dockerfile b/examples/pipelines/datacomp/components/dummy/Dockerfile index 2378703ea..d40d75e77 100644 --- a/examples/pipelines/datacomp/components/dummy/Dockerfile +++ b/examples/pipelines/datacomp/components/dummy/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244 +ARG FONDANT_VERSION=be3e4b83dcde14f6c8a1b33078c99ead1d40655e RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/dummy/fondant_component.yaml b/examples/pipelines/datacomp/components/dummy/fondant_component.yaml index 346f3aa1b..ee39a4ac5 100644 --- a/examples/pipelines/datacomp/components/dummy/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/dummy/fondant_component.yaml @@ -1,6 +1,6 @@ name: Dummy component description: Component that downloads images based on URLs -image: ghcr.io/ml6team/dummy:cc418db0964ee202ebf2d4fd3aaee15604604244 +image: ghcr.io/ml6team/dummy:be3e4b83dcde14f6c8a1b33078c99ead1d40655e consumes: image: diff --git a/examples/pipelines/datacomp/components/dummy/src/main.py 
b/examples/pipelines/datacomp/components/dummy/src/main.py
index 5700c1936..929d04245 100644
--- a/examples/pipelines/datacomp/components/dummy/src/main.py
+++ b/examples/pipelines/datacomp/components/dummy/src/main.py
@@ -17,6 +17,8 @@ class DummyComponent(DaskTransformComponent):
     def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:
 
         logger.info(f"Length of the dataframe: {len(dataframe)}")
+        print("Columns of the dataframe:", dataframe.columns)
+        print("Dtypes of the dataframe:", dataframe.dtypes)
 
         return dataframe
 
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index ab25454b6..5fed49cae 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -86,9 +86,9 @@
     node_pool_name="model-inference-pool",
     output_partition_size="disable",
 )
-# dummpy_op = ComponentOp(
-# component_dir="components/dummy",
-# )
+dummpy_op = ComponentOp(
+    component_dir="components/dummy",
+)
 
 
 
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 9683e7e3d..25f92be41 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -205,6 +205,9 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:
 
         logger.info("Creating write tasks...")
 
+        print("Dataframe columns:", dataframe.columns)
+        print("Dataframe dtypes:", dataframe.dtypes)
+
         # Turn index into an empty dataframe so we can write it
         index_df = dataframe.index.to_frame().drop(columns=["id"])
         write_index_task = self._write_subset(

From 2cee54b7804817bdc51facb9b9781233a4ea6ea7 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Sun, 6 Aug 2023 12:27:03 +0200
Subject: [PATCH 63/65] More improvements

---
 .../datacomp/components/detect_text_torch_gpu/Dockerfile     | 2 +-
 .../components/detect_text_torch_gpu/fondant_component.yaml  | 2 +-
 .../datacomp/components/detect_text_torch_gpu/src/main.py    | 5 ++++-
 .../pipelines/datacomp/components/download_images/Dockerfile | 2 +-
 .../components/download_images/fondant_component.yaml        | 2 +-
 5 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile
index 6c3c2a504..a2adc1c4f 100644
--- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile
+++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt
 
 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=be3e4b83dcde14f6c8a1b33078c99ead1d40655e
+ARG FONDANT_VERSION=b40c7d9b579b0fc3531c871c82f39c1a395c2ee4
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
 
 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
index 77a2590d2..20b642bfc 100644
--- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Detect text
 description: Component that detects text in images
-image: ghcr.io/ml6team/detect_text_torch_gpu:be3e4b83dcde14f6c8a1b33078c99ead1d40655e
+image: ghcr.io/ml6team/detect_text_torch_gpu:b40c7d9b579b0fc3531c871c82f39c1a395c2ee4
 
 consumes:
   image:
diff --git 
a/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py index ef04907d9..f3698efee 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py @@ -128,7 +128,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: # for each column in the output meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} meta["image_data"] = bytes - meta["image_boxes"] = np.dtype(object) + meta["image_boxes"] = np.dtype(np.int64) logger.info("Detecting texts..") dataframe = dataframe.map_partitions( @@ -137,6 +137,9 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: meta=meta, ) + # cast image_data to the right dtype + dataframe = dataframe.astype({'image_data': bytes, 'image_boxes': np.dtype(np.int64)}) + return dataframe diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile index e747f39e9..5dfff914c 100644 --- a/examples/pipelines/datacomp/components/download_images/Dockerfile +++ b/examples/pipelines/datacomp/components/download_images/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 +ARG FONDANT_VERSION=b40c7d9b579b0fc3531c871c82f39c1a395c2ee4 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 6d700e80c..76504f1dc 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 +image: ghcr.io/ml6team/download_images:b40c7d9b579b0fc3531c871c82f39c1a395c2ee4 consumes: image: From 39c56436e20fb920a50c26a4d0753251993f3251 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 8 Aug 2023 10:04:24 +0200 Subject: [PATCH 64/65] Remove dummy op --- .../detect_text_torch_gpu/Dockerfile | 2 +- .../fondant_component.yaml | 2 +- .../datacomp/components/dummy/Dockerfile | 23 --------------- .../components/dummy/fondant_component.yaml | 9 ------ .../components/dummy/requirements.txt | 0 .../datacomp/components/dummy/src/main.py | 28 ------------------- examples/pipelines/datacomp/pipeline.py | 8 ------ 7 files changed, 2 insertions(+), 70 deletions(-) delete mode 100644 examples/pipelines/datacomp/components/dummy/Dockerfile delete mode 100644 examples/pipelines/datacomp/components/dummy/fondant_component.yaml delete mode 100644 examples/pipelines/datacomp/components/dummy/requirements.txt delete mode 100644 examples/pipelines/datacomp/components/dummy/src/main.py diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile index a2adc1c4f..52a84d19f 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile +++ 
b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=b40c7d9b579b0fc3531c871c82f39c1a395c2ee4 +ARG FONDANT_VERSION=2cee54b7804817bdc51facb9b9781233a4ea6ea7 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml index 20b642bfc..2fb1d525c 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text_torch_gpu:b40c7d9b579b0fc3531c871c82f39c1a395c2ee4 +image: ghcr.io/ml6team/detect_text_torch_gpu:2cee54b7804817bdc51facb9b9781233a4ea6ea7 consumes: image: diff --git a/examples/pipelines/datacomp/components/dummy/Dockerfile b/examples/pipelines/datacomp/components/dummy/Dockerfile deleted file mode 100644 index d40d75e77..000000000 --- a/examples/pipelines/datacomp/components/dummy/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -FROM --platform=linux/amd64 python:3.8-slim - -# System dependencies -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install git -y - -# Install requirements -COPY requirements.txt / -RUN pip3 install --no-cache-dir -r requirements.txt - -# Install Fondant -# This is split from other requirements to leverage caching -ARG FONDANT_VERSION=be3e4b83dcde14f6c8a1b33078c99ead1d40655e -RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} - -# Set the working directory to the component folder -WORKDIR /component/src - -# Copy over src-files -COPY src/ . - -ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/dummy/fondant_component.yaml b/examples/pipelines/datacomp/components/dummy/fondant_component.yaml deleted file mode 100644 index ee39a4ac5..000000000 --- a/examples/pipelines/datacomp/components/dummy/fondant_component.yaml +++ /dev/null @@ -1,9 +0,0 @@ -name: Dummy component -description: Component that downloads images based on URLs -image: ghcr.io/ml6team/dummy:be3e4b83dcde14f6c8a1b33078c99ead1d40655e - -consumes: - image: - fields: - data: - type: binary \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/dummy/requirements.txt b/examples/pipelines/datacomp/components/dummy/requirements.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/pipelines/datacomp/components/dummy/src/main.py b/examples/pipelines/datacomp/components/dummy/src/main.py deleted file mode 100644 index 929d04245..000000000 --- a/examples/pipelines/datacomp/components/dummy/src/main.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -Dummy component for debugging. 
-""" -import logging - -import dask.dataframe as dd - -from fondant.component import DaskTransformComponent -from fondant.executor import DaskTransformExecutor - -logger = logging.getLogger(__name__) - - -class DummyComponent(DaskTransformComponent): - """Component that downloads images based on URLs.""" - - def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: - - logger.info(f"Length of the dataframe: {len(dataframe)}") - print("Columns of the dataframe:", dataframe.columns) - print("Dyptes of the dataframe:", dataframe.dtypes) - - return dataframe - - -if __name__ == "__main__": - executor = DaskTransformExecutor.from_args() - executor.execute(DummyComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 5fed49cae..8ead7d553 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -76,9 +76,6 @@ node_pool_name="model-inference-pool", output_partition_size="disable", ) -# dummpy_op = ComponentOp( -# component_dir="components/dummy", -# ) detect_text_gpu_op = ComponentOp( component_dir="components/detect_text_torch_gpu", number_of_gpus=1, @@ -86,11 +83,6 @@ node_pool_name="model-inference-pool", output_partition_size="disable", ) -dummpy_op = ComponentOp( - component_dir="components/dummy", -) - - # add ops to pipeline From 8ce9b372b58875efc9c5b41aa5d82fb69b898d53 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 8 Aug 2023 10:30:31 +0200 Subject: [PATCH 65/65] Update dockerfile --- examples/pipelines/datacomp/components/detect_text/Dockerfile | 2 +- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile index ad09d730e..093e48947 100644 --- a/examples/pipelines/datacomp/components/detect_text/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install torch torchvision --index-url https://download.pytorch.org/whl/ # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244 +ARG FONDANT_VERSION=39c56436e20fb920a50c26a4d0753251993f3251 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 357feb55f..b7364d946 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:cc418db0964ee202ebf2d4fd3aaee15604604244 +image: ghcr.io/ml6team/detect_text:39c56436e20fb920a50c26a4d0753251993f3251 consumes: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 8ead7d553..1455da71d 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -88,7 +88,7 @@ # add ops to pipeline pipeline.add_op(load_from_hub_op) pipeline.add_op(download_images_op, dependencies=load_from_hub_op) -pipeline.add_op(detect_text_gpu_op, 
dependencies=download_images_op) +pipeline.add_op(detect_text_op, dependencies=download_images_op) # TODO add more ops client.compile_and_run(pipeline=pipeline) \ No newline at end of file