From 7dcd1170421b07ccfd2441255b24c0a6724a382c Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 25 Jul 2023 13:36:27 +0200
Subject: [PATCH 01/65] More fixes

---
 components/filter_image_resolution/Dockerfile | 2 +-
 .../filter_image_resolution/fondant_component.yaml | 2 +-
 .../components/cluster_image_embeddings/Dockerfile | 2 +-
 .../components/cluster_image_embeddings/src/main.py | 2 +-
 .../components/filter_text_complexity/Dockerfile | 2 +-
 examples/pipelines/datacomp/pipeline.py | 13 +------------
 scripts/build_components.sh | 2 +-
 7 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile
index abfa9a414..e36badeaf 100644
--- a/components/filter_image_resolution/Dockerfile
+++ b/components/filter_image_resolution/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
+ARG FONDANT_VERSION=79df895e9d62d2010ccb8d40ee7e4fd4c68f117d
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml
index dcac31145..f54507827 100644
--- a/components/filter_image_resolution/fondant_component.yaml
+++ b/components/filter_image_resolution/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Filter image resolution
 description: Component that filters images based on minimum size and max aspect ratio
-image: ghcr.io/ml6team/filter_image_resolution:latest
+image: ghcr.io/ml6team/filter_image_resolution:79df895e9d62d2010ccb8d40ee7e4fd4c68f117d

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/cluster_image_embeddings/Dockerfile b/examples/pipelines/datacomp/components/cluster_image_embeddings/Dockerfile
index abfa9a414..e36badeaf 100644
--- a/examples/pipelines/datacomp/components/cluster_image_embeddings/Dockerfile
+++ b/examples/pipelines/datacomp/components/cluster_image_embeddings/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
+ARG FONDANT_VERSION=79df895e9d62d2010ccb8d40ee7e4fd4c68f117d
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/cluster_image_embeddings/src/main.py b/examples/pipelines/datacomp/components/cluster_image_embeddings/src/main.py
index c2ee12c51..6d9b7420c 100644
--- a/examples/pipelines/datacomp/components/cluster_image_embeddings/src/main.py
+++ b/examples/pipelines/datacomp/components/cluster_image_embeddings/src/main.py
@@ -16,7 +16,7 @@
 class ClusterImageEmbeddingsComponent(DaskTransformComponent):
     """Component that clusters images based on embeddings."""

-    def __init__(self, sample_ratio: float, num_clusters: int) -> None:
+    def __init__(self, *_, sample_ratio: float, num_clusters: int) -> None:
         self.sample_ratio = sample_ratio
         self.num_clusters = num_clusters
diff --git a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile
index 397a5f37b..610851a01 100644
--- a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile
+++ b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile
@@ -12,7 +12,7 @@ RUN python -m spacy download en_core_web_sm

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
+ARG FONDANT_VERSION=79df895e9d62d2010ccb8d40ee7e4fd4c68f117d
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 9af28c365..f7a922690 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -7,14 +7,13 @@

 from pipeline_configs import PipelineConfigs

-from fondant.compiler import DockerCompiler
 from fondant.pipeline import ComponentOp, Pipeline, Client

 logger = logging.getLogger(__name__)

 # Initialize pipeline and client
 pipeline = Pipeline(
-    pipeline_name="Datacomp filtering pipeline",
+    pipeline_name="datacomp-filtering-pipeline",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
     # base_path=PipelineConfigs.BASE_PATH,
     base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
 )
@@ -69,13 +68,3 @@
 pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops
-
-# compile
-if __name__ == "__main__":
-    compiler = DockerCompiler()
-    # mount the gcloud credentials to the container
-    extra_volumes = [
-        "$HOME/.config/gcloud/application_default_credentials.json:/root/.config/gcloud/application_default_credentials.json:ro"
-    ]
-    compiler.compile(pipeline=pipeline, extra_volumes=extra_volumes)
-    logger.info("Run `docker compose up` to run the pipeline.")
diff --git a/scripts/build_components.sh b/scripts/build_components.sh
index a78544802..265d08b83 100755
--- a/scripts/build_components.sh
+++ b/scripts/build_components.sh
@@ -7,7 +7,7 @@ function usage {
   echo " -t, --tag Tag to add to image, repeatable
                The first tag is set in the component specifications"
   echo " -c, --cache Use registry caching when building the components (default:false)"
-  echo " -d, --component-dirs Directory containing components to build as subdirectories.
+  echo " -d, --components-dir Directory containing components to build as subdirectories.
                The path should be relative to the root directory (default:components)"
   echo " -n, --namespace The namespace for the built images, should match the github organization (default: ml6team)"
   echo " -co, --component Specific component to build. Pass the component subdirectory name(s) to build
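A note on the *_ signature change in patch 01 above: a bare `*_` in a parameter list collects and discards any positional arguments, so everything declared after it becomes keyword-only. A minimal standalone sketch of the idiom (the class name is illustrative, not from the codebase):

    class ExampleComponent:
        def __init__(self, *_, sample_ratio: float, num_clusters: int) -> None:
            # sample_ratio and num_clusters can only be passed by keyword
            self.sample_ratio = sample_ratio
            self.num_clusters = num_clusters

    ExampleComponent(sample_ratio=0.5, num_clusters=10)  # works
    # ExampleComponent(0.5, 10) raises TypeError: the positional values are
    # swallowed by *_ and the required keyword-only arguments are missing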
From 05c71564ae873a6454595925f574f5e03c09f3d8 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 25 Jul 2023 16:13:05 +0200
Subject: [PATCH 02/65] More improvements

---
 .../components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index b89a10324..6eb56e741 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:dev
+image: ghcr.io/ml6team/load_from_hf_hub:56b83265bc80e0f98559e58d01d1f18575b85d6b

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index f7a922690..52763a711 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -38,7 +38,7 @@
     arguments={
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 100,
+        "n_rows_to_load": 500000,
     },
 )
 filter_image_resolution_op = ComponentOp.from_registry(
@@ -64,7 +64,9 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
-pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
-pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
+# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+# pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
+# pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops
+
+# client.compile_and_run(pipeline=pipeline)
\ No newline at end of file

From aff3fee38324256cfbc678f6b81d00a88340e967 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 25 Jul 2023 16:13:57 +0200
Subject: [PATCH 03/65] More improvements

---
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 components/load_from_hf_hub/src/main.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 0099a92f8..3d6741e46 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:dev
+image: ghcr.io/ml6team/load_from_hf_hub:56b83265bc80e0f98559e58d01d1f18575b85d6b

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index a55daf0c9..7b3aaf0c6 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -48,8 +48,7 @@ def load(self) -> dd.DataFrame:

         # 4) Optional: only return specific amount of rows
         if self.n_rows_to_load:
-            dask_df = dask_df.head(self.n_rows_to_load)
-            dask_df = dd.from_pandas(dask_df, npartitions=1)
+            dask_df = dask_df.loc[:self.n_rows_to_load]

         return dask_df
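The `.loc[:n]` slicing introduced in patch 03 selects by index label (and is inclusive) rather than by position, which only matches "the first n rows" when the index happens to be a clean 0..N-1 range; later patches in this series move back to `head()`. A small sketch of the difference, assuming a default integer index:

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": range(100)}), npartitions=4)

    # label-based and inclusive: returns rows with index label <= 10 (11 rows),
    # and only equals "the first 11 rows" because the index is 0..99
    subset = ddf.loc[:10].compute()

    # position-based: always the first 10 rows, returned as a pandas DataFrame
    first_ten = ddf.head(10)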
From 710685bfed896e04726af6371a20f778eda5ab2e Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 25 Jul 2023 16:33:26 +0200
Subject: [PATCH 04/65] Add logging

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/src/main.py | 4 +++-
 src/fondant/data_io.py | 6 +++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index abfa9a414..ec5c49504 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=main
+ARG FONDANT_VERSION=56b83265bc80e0f98559e58d01d1f18575b85d6b
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index 7b3aaf0c6..09b9117d6 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -44,10 +44,12 @@ def load(self) -> dd.DataFrame:
         )

         # 3) Rename columns
+        logger.info("Renaming columns...")
         dask_df = dask_df.rename(columns=self.column_name_mapping)

         # 4) Optional: only return specific amount of rows
-        if self.n_rows_to_load:
+        if self.n_rows_to_load is not None:
+            logger.info(f"Loading first {self.n_rows_to_load} rows...")
             dask_df = dask_df.loc[:self.n_rows_to_load]

         return dask_df
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 4d8b116d8..409945eac 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -193,6 +193,10 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:
         write_tasks = []

         dataframe = self.partition_written_dataframe(dataframe)
+
+        logger.info("Dataframe number of partitions:", dataframe.npartitions)
+
+        logger.info("Creating write tasks...")

         dataframe.index = dataframe.index.rename("id").astype("string")
@@ -286,6 +290,7 @@ def _create_write_task(
         A delayed Dask task that uploads the DataFrame to the remote storage location
         when executed.
         """
+        logging.info(f"Creating write task for: {location}")
         write_task = dd.to_parquet(
             dataframe,
             location,
@@ -293,5 +298,4 @@
             overwrite=False,
             compute=False,
         )
-        logging.info(f"Creating write task for: {location}")
         return write_task
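For context on the write tasks mentioned in the logging above: with `compute=False`, `dd.to_parquet` returns a delayed task instead of writing immediately, so multiple writes can be collected and executed in a single `dd.compute` call. A minimal sketch (the output path is a placeholder):

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)

    # lazy write: returns a delayed task instead of writing now
    write_task = dd.to_parquet(ddf, "/tmp/example_subset", compute=False)

    # several such tasks can be gathered and executed together
    dd.compute(write_task)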
""" + logging.info(f"Creating write task for: {location}") write_task = dd.to_parquet( dataframe, location, @@ -293,5 +298,4 @@ def _create_write_task( overwrite=False, compute=False, ) - logging.info(f"Creating write task for: {location}") return write_task From c070babd930ed0945ab04e802242c21844443a2b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 25 Jul 2023 16:43:02 +0200 Subject: [PATCH 05/65] Update dockerfile --- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index ec5c49504..6a06fb3fc 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=56b83265bc80e0f98559e58d01d1f18575b85d6b +ARG FONDANT_VERSION=6b2843e3c5c9e2df22ac24a560216eb89211f9ee RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 3d6741e46..92e53c66a 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:56b83265bc80e0f98559e58d01d1f18575b85d6b +image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 6eb56e741..18af6c11e 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:56b83265bc80e0f98559e58d01d1f18575b85d6b +image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee produces: image: From 698b92caeae5f3ed6296e580cbe90b5941de33d6 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 25 Jul 2023 17:00:51 +0200 Subject: [PATCH 06/65] Fix dtype --- .../components/load_from_hf_hub/fondant_component.yaml | 4 ++-- examples/pipelines/datacomp/pipeline.py | 6 +++--- src/fondant/schemas/common.json | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 18af6c11e..9b58d5364 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -8,9 +8,9 @@ produces: url: type: string width: - type: int16 + type: int64 height: - type: int16 + type: int64 face_bboxes: type: array items: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 52763a711..4035b66c2 100644 --- 
a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -69,4 +69,4 @@ # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops -# client.compile_and_run(pipeline=pipeline) \ No newline at end of file +client.compile_and_run(pipeline=pipeline) \ No newline at end of file diff --git a/src/fondant/schemas/common.json b/src/fondant/schemas/common.json index 969ecd1a8..11df4e988 100644 --- a/src/fondant/schemas/common.json +++ b/src/fondant/schemas/common.json @@ -7,6 +7,7 @@ "int8", "int16", "int32", + "int64", "uint8", "uint16", "uint32", From 6ed53842308e2d6411946f9f2cb58231aae69996 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 25 Jul 2023 17:35:32 +0200 Subject: [PATCH 07/65] Update Dockerfile --- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index 6a06fb3fc..662017ba7 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=6b2843e3c5c9e2df22ac24a560216eb89211f9ee +ARG FONDANT_VERSION=8174eb4a23b742f64a837daf6b5c5b929d70db61 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 92e53c66a..ed83827a6 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee +image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61 produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 9b58d5364..4fcafd38c 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee +image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61 produces: image: From f253d9c65412aabc4ce0c745fd5bd2f5bd7f88ed Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 26 Jul 2023 08:59:45 +0200 Subject: [PATCH 08/65] More updates --- components/load_from_hf_hub/Dockerfile | 2 +- 
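The dtype fix above has two parts: the component spec now declares width/height as int64, and the shared type enum in common.json has to accept "int64" for that spec to validate. A trimmed-down, hypothetical sketch of that validation step (the schema dict here is a stand-in, not the real common.json):

    import jsonschema

    # simplified stand-in for the type enum in src/fondant/schemas/common.json
    field_type_schema = {"enum": ["int8", "int16", "int32", "int64", "uint8"]}

    # passes once "int64" is part of the enum; raises ValidationError otherwise
    jsonschema.validate("int64", field_type_schema)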
From 6ed53842308e2d6411946f9f2cb58231aae69996 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 25 Jul 2023 17:35:32 +0200
Subject: [PATCH 07/65] Update Dockerfile

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 6a06fb3fc..662017ba7 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=6b2843e3c5c9e2df22ac24a560216eb89211f9ee
+ARG FONDANT_VERSION=8174eb4a23b742f64a837daf6b5c5b929d70db61
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 92e53c66a..ed83827a6 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee
+image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index 9b58d5364..4fcafd38c 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:6b2843e3c5c9e2df22ac24a560216eb89211f9ee
+image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61

 produces:
   image:

From f253d9c65412aabc4ce0c745fd5bd2f5bd7f88ed Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 08:59:45 +0200
Subject: [PATCH 08/65] More updates

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 7 ++++---
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 662017ba7..55ec91ab6 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=8174eb4a23b742f64a837daf6b5c5b929d70db61
+ARG FONDANT_VERSION=42bb3f62a4411108e88e6fdc353494cf8fe9d367
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index ed83827a6..165a1793f 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61
+image: ghcr.io/ml6team/load_from_hf_hub:42bb3f62a4411108e88e6fdc353494cf8fe9d367

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index 4fcafd38c..ea7537864 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:8174eb4a23b742f64a837daf6b5c5b929d70db61
+image: ghcr.io/ml6team/load_from_hf_hub:42bb3f62a4411108e88e6fdc353494cf8fe9d367

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 4035b66c2..bc24046ba 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -15,8 +15,8 @@
 pipeline = Pipeline(
     pipeline_name="datacomp-filtering-pipeline",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
-    base_path=PipelineConfigs.BASE_PATH,
-    # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
+    # base_path=PipelineConfigs.BASE_PATH,
+    base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
 )
 client = Client(host=PipelineConfigs.HOST)
@@ -37,6 +37,7 @@
     component_dir="components/load_from_hf_hub",
     arguments={
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
+        "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
         "n_rows_to_load": 500000,
     },
@@ -69,4 +70,4 @@
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

-client.compile_and_run(pipeline=pipeline)
\ No newline at end of file
+# client.compile_and_run(pipeline=pipeline)
\ No newline at end of file

From 2c990c40ac857bf7a7cb42d09627e6ef907276be Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 09:08:53 +0200
Subject: [PATCH 09/65] Update logging

---
 examples/pipelines/datacomp/pipeline.py | 2 +-
 src/fondant/data_io.py | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index bc24046ba..c6b5b3d78 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,7 +39,7 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 500000,
+        "n_rows_to_load": 10,
     },
 )
 filter_image_resolution_op = ComponentOp.from_registry(
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 409945eac..8b3123067 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -163,20 +163,19 @@ def partition_written_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame:
         """
         if self.output_partition_size != "disable":
             if isinstance(self.output_partition_size, str):
-                dataframe = dataframe.repartition(
-                    partition_size=self.output_partition_size,
-                )
                 logger.info(
                     f"Repartitioning the written data such that the size per partition is approx."
                     f" {self.output_partition_size}",
                 )
-
+                dataframe = dataframe.repartition(
+                    partition_size=self.output_partition_size,
+                )
             elif self.output_partition_size is None:
-                dataframe = dataframe.repartition(partition_size="250MB")
                 logger.info(
                     "Repartitioning the written data such that the size per partition is approx."
                     " 250MB. (Automatic repartitioning)",
                 )
+                dataframe = dataframe.repartition(partition_size="250MB")
             else:
                 msg = (
                     f"{self.output_partition_size} is not a valid argument. Choose either the"
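For reference, the repartitioning that this logging describes: Dask can split or merge partitions to approximate a target size per partition, 250MB being the automatic fallback used here. A minimal standalone sketch:

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": range(1_000_000)}), npartitions=1)

    # re-split the data so each partition is roughly 250MB
    ddf = ddf.repartition(partition_size="250MB")
    print(ddf.npartitions)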
From 87df957d892a95d34e5828a3490b6157dc8bb3f7 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 10:35:12 +0200
Subject: [PATCH 10/65] More improvements

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 components/load_from_hf_hub/src/main.py | 3 ++-
 .../components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 4 ++--
 src/fondant/data_io.py | 2 +-
 6 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 55ec91ab6..5c85e3b44 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=42bb3f62a4411108e88e6fdc353494cf8fe9d367
+ARG FONDANT_VERSION=a8ba56e7e38468872eb4a9829c77b2b1aa2003e0
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 165a1793f..8e2d92b3a 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:42bb3f62a4411108e88e6fdc353494cf8fe9d367
+image: ghcr.io/ml6team/load_from_hf_hub:a8ba56e7e38468872eb4a9829c77b2b1aa2003e0

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index 09b9117d6..f3931f303 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -50,7 +50,8 @@ def load(self) -> dd.DataFrame:
         # 4) Optional: only return specific amount of rows
         if self.n_rows_to_load is not None:
             logger.info(f"Loading first {self.n_rows_to_load} rows...")
-            dask_df = dask_df.loc[:self.n_rows_to_load]
+            dask_df = dask_df.head(self.n_rows_to_load)
+            dask_df = dd.from_pandas(dask_df, npartitions=1)

         return dask_df
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index ea7537864..4ebeb261e 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:42bb3f62a4411108e88e6fdc353494cf8fe9d367
+image: ghcr.io/ml6team/load_from_hf_hub:a8ba56e7e38468872eb4a9829c77b2b1aa2003e0

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index c6b5b3d78..c597b5e60 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,7 +39,7 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 10,
+        "n_rows_to_load": 500000,
     },
 )
 filter_image_resolution_op = ComponentOp.from_registry(
@@ -70,4 +70,4 @@
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

-# client.compile_and_run(pipeline=pipeline)
\ No newline at end of file
+client.compile_and_run(pipeline=pipeline)
\ No newline at end of file
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 8b3123067..2f97252fb 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -193,7 +193,7 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:

         dataframe = self.partition_written_dataframe(dataframe)

-        logger.info("Dataframe number of partitions:", dataframe.npartitions)
+        logger.info(f"Dataframe number of partitions is {dataframe.npartitions}")

         logger.info("Creating write tasks...")

From f702be1c7a981840aacad77fcbcf1369f890d7cb Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 10:44:22 +0200
Subject: [PATCH 11/65] Update specs

---
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 8e2d92b3a..f1d849c57 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:a8ba56e7e38468872eb4a9829c77b2b1aa2003e0
+image: ghcr.io/ml6team/load_from_hf_hub:1f1ab27717d70a135f825d9fb97fb1ed038262c9

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index 4ebeb261e..712fb48e3 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:a8ba56e7e38468872eb4a9829c77b2b1aa2003e0
+image: ghcr.io/ml6team/load_from_hf_hub:1f1ab27717d70a135f825d9fb97fb1ed038262c9

 produces:
   image:

From 7c19cc74da09203a113ff0ffa3b9e43d846c5c63 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 11:08:47 +0200
Subject: [PATCH 12/65] Improve load_from_hf_hub component

---
 components/load_from_hf_hub/src/main.py | 12 +++++++++++-
 examples/pipelines/datacomp/pipeline.py | 2 ++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index f3931f303..e7d2c447b 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -16,6 +16,7 @@ def __init__(self, *_,
                  column_name_mapping: dict,
                  image_column_names: t.Optional[list],
                  n_rows_to_load: t.Optional[int],
+                 dataset_length: int,
                  ) -> None:
         """
         Args:
@@ -25,11 +26,14 @@
                 format the image from HF hub format to a byte string
             n_rows_to_load: optional argument that defines the number of rows to load.
                 Useful for testing pipeline runs on a small scale.
+            dataset_length: optional argument that specifies the length of the entire dataset. Only
+                required in case n_rows_to_load is specified.
         """
         self.dataset_name = dataset_name
         self.column_name_mapping = column_name_mapping
         self.image_column_names = image_column_names
         self.n_rows_to_load = n_rows_to_load
+        self.dataset_length = dataset_length

     def load(self) -> dd.DataFrame:
         # 1) Load data, read as Dask dataframe
@@ -49,8 +53,14 @@ def load(self) -> dd.DataFrame:

         # 4) Optional: only return specific amount of rows
         if self.n_rows_to_load is not None:
+            if self.dataset_length is None:
+                raise ValueError("""Make sure to also specify the length of the entire
+                                 dataset. This is required as otherwise only the first
+                                 partition can be loaded""")
             logger.info(f"Loading first {self.n_rows_to_load} rows...")
-            dask_df = dask_df.head(self.n_rows_to_load)
+            partition_length = self.dataset_length // dask_df.npartitions
+            npartitions = self.n_rows_to_load // partition_length
+            dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions)
             dask_df = dd.from_pandas(dask_df, npartitions=1)

         return dask_df
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index c597b5e60..43bec4068 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -40,7 +40,9 @@
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
         "n_rows_to_load": 500000,
+        "dataset_length": 12800000,
     },
+    node_pool_name="n2-standard-128-pool",
 )
 filter_image_resolution_op = ComponentOp.from_registry(
     name="filter_image_resolution",
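The head-based loading introduced in patch 12, in standalone form: by default `head()` only inspects the first partition, so the component estimates how many partitions are needed to cover `n_rows_to_load`, using the user-supplied total dataset length. A sketch with made-up numbers:

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": range(1000)}), npartitions=10)

    n_rows_to_load = 200
    dataset_length = 1000

    # roughly 100 rows per partition, so 2 partitions cover 200 rows
    partition_length = dataset_length // ddf.npartitions
    npartitions = n_rows_to_load // partition_length

    # head() returns a pandas DataFrame; the component converts it back
    # to a (single-partition) Dask dataframe afterwards
    pdf = ddf.head(n_rows_to_load, npartitions=npartitions)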
From 08bc45a01314eeaab16cb084209133eee7c900e3 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 11:43:08 +0200
Subject: [PATCH 13/65] Update specs

---
 components/filter_image_resolution/Dockerfile | 2 +-
 .../filter_image_resolution/fondant_component.yaml | 6 +++---
 components/load_from_hf_hub/fondant_component.yaml | 9 +++++++--
 components/load_from_hf_hub/src/main.py | 2 +-
 .../components/load_from_hf_hub/fondant_component.yaml | 6 +++++-
 examples/pipelines/datacomp/pipeline.py | 6 +++---
 6 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile
index e36badeaf..bc16d1c64 100644
--- a/components/filter_image_resolution/Dockerfile
+++ b/components/filter_image_resolution/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=79df895e9d62d2010ccb8d40ee7e4fd4c68f117d
+ARG FONDANT_VERSION=41bd8f9d8f8003b41ffa375c1887869208e519ea
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml
index f54507827..096bfb730 100644
--- a/components/filter_image_resolution/fondant_component.yaml
+++ b/components/filter_image_resolution/fondant_component.yaml
@@ -1,14 +1,14 @@
 name: Filter image resolution
 description: Component that filters images based on minimum size and max aspect ratio
-image: ghcr.io/ml6team/filter_image_resolution:79df895e9d62d2010ccb8d40ee7e4fd4c68f117d
+image: ghcr.io/ml6team/filter_image_resolution:41bd8f9d8f8003b41ffa375c1887869208e519ea

 consumes:
   image:
     fields:
       width:
-        type: int16
+        type: int64
       height:
-        type: int16
+        type: int64

 args:
   min_image_dim:
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index f1d849c57..5bfacee04 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:1f1ab27717d70a135f825d9fb97fb1ed038262c9
+image: ghcr.io/ml6team/load_from_hf_hub:41bd8f9d8f8003b41ffa375c1887869208e519ea

 produces:
   dummy_variable: #TODO: fill in here
@@ -23,4 +23,9 @@
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
     type: int
-    default: None
\ No newline at end of file
+    default: None
+  dataset_length:
+    description: Optional argument that defines the length of the dataset. Required in case `n_rows_to_load` is specified.
+    type: int
+    default: None
+
\ No newline at end of file
diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index e7d2c447b..8fe5266b8 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -57,7 +57,7 @@ def load(self) -> dd.DataFrame:
                 raise ValueError("""Make sure to also specify the length of the entire
                                  dataset. This is required as otherwise only the first
                                  partition can be loaded""")
-            logger.info(f"Loading first {self.n_rows_to_load} rows...")
+            logger.info(f"Loading approximately {self.n_rows_to_load} rows...")
             partition_length = self.dataset_length // dask_df.npartitions
             npartitions = self.n_rows_to_load // partition_length
             dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions)
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index 712fb48e3..af70ce67d 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:1f1ab27717d70a135f825d9fb97fb1ed038262c9
+image: ghcr.io/ml6team/load_from_hf_hub:41bd8f9d8f8003b41ffa375c1887869208e519ea

 produces:
   image:
@@ -51,4 +51,8 @@ args:
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
     type: int
+    default: None
+  dataset_length:
+    description: Optional argument that defines the length of the dataset. Required in case `n_rows_to_load` is specified.
+    type: int
     default: None
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 43bec4068..5dccfa176 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -15,8 +15,8 @@
 pipeline = Pipeline(
     pipeline_name="datacomp-filtering-pipeline",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
-    # base_path=PipelineConfigs.BASE_PATH,
-    base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
+    base_path=PipelineConfigs.BASE_PATH,
+    # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
 )
 client = Client(host=PipelineConfigs.HOST)
@@ -67,7 +67,7 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

From 0912acc0a3b6516278df6324bfff30ca17c119a0 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 13:28:06 +0200
Subject: [PATCH 14/65] Add task graph

---
 examples/pipelines/datacomp/pipeline.py | 6 ++++--
 src/fondant/data_io.py | 3 +++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 5dccfa176..84364ae7e 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,8 +39,10 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 500000,
+        "n_rows_to_load": 10,
         "dataset_length": 12800000,
+        # "n_rows_to_load": 500000,
+        # "dataset_length": 12800000,
     },
     node_pool_name="n2-standard-128-pool",
 )
@@ -67,7 +69,7 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 2f97252fb..71eba6e58 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -2,6 +2,7 @@
 import os
 import typing as t

+import dask
 import dask.dataframe as dd
 from dask.diagnostics import ProgressBar
@@ -223,6 +224,8 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:

         with ProgressBar():
             logging.info("Writing data...")
+            # visualize the low level Dask graph
+            dask.visualize(*write_tasks, filename='task_graph.png')
             dd.compute(*write_tasks)

     @staticmethod
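The `dask.visualize` call added in patch 14 renders the low-level task graph to an image; it needs the graphviz package, which is why the next patch adds it as a dependency. A minimal sketch:

    import dask
    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    task = dd.to_parquet(ddf, "/tmp/example", compute=False)

    # writes the task graph to task_graph.png (requires graphviz)
    dask.visualize(task, filename="task_graph.png")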
From 9d5208de14aa71965e5f81ff979ac2a428aef09f Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 13:36:12 +0200
Subject: [PATCH 15/65] Add graphviz to the dependencies

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 2 +-
 pyproject.toml | 1 +
 5 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 5c85e3b44..4677f2b9e 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=a8ba56e7e38468872eb4a9829c77b2b1aa2003e0
+ARG FONDANT_VERSION=c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 5bfacee04..05343e40d 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:41bd8f9d8f8003b41ffa375c1887869208e519ea
+image: ghcr.io/ml6team/load_from_hf_hub:c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index af70ce67d..ac9fa1a3a 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:41bd8f9d8f8003b41ffa375c1887869208e519ea
+image: ghcr.io/ml6team/load_from_hf_hub:c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 84364ae7e..6b1431d67 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,7 +39,7 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 10,
+        "n_rows_to_load": 100000,
         "dataset_length": 12800000,
         # "n_rows_to_load": 500000,
         # "dataset_length": 12800000,
diff --git a/pyproject.toml b/pyproject.toml
index 7b837de9f..b074eb7a2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ dask = {extras = ["dataframe"], version = ">= 2023.4.1"}
 importlib-resources = { version = ">= 1.3", python = "<3.9" }
 jsonschema = ">= 4.18"
 pyarrow = ">= 11.0.0"
+graphviz = ">= 0.20.1"

 fsspec = { version = ">= 2023.4.0", optional = true}
 gcsfs = { version = ">= 2023.4.0", optional = true }

From 0d42734a2fd8d2c4ee1c6340b59fddd0bae93f9c Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 14:02:31 +0200
Subject: [PATCH 16/65] Update Dockerfile

---
 components/load_from_hf_hub/Dockerfile | 5 +++--
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 4 ++--
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 4677f2b9e..34c2e3a05 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -3,7 +3,8 @@ FROM --platform=linux/amd64 python:3.8-slim
 # System dependencies
 RUN apt-get update && \
     apt-get upgrade -y && \
-    apt-get install git -y
+    apt-get install git -y && \
+    apt-get install graphviz -y

 # Install requirements
 COPY requirements.txt /
@@ -11,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769
+ARG FONDANT_VERSION=797f1a81694e4d66c4fe39edbbc9fc2dafce830a
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 05343e40d..9310e76d6 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769
+image: ghcr.io/ml6team/load_from_hf_hub:797f1a81694e4d66c4fe39edbbc9fc2dafce830a

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index ac9fa1a3a..34b9b5ee5 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:c09a9e3ac49f8ecc35bf74ea550ce841d3ed3769
+image: ghcr.io/ml6team/load_from_hf_hub:797f1a81694e4d66c4fe39edbbc9fc2dafce830a

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 6b1431d67..ae14144d5 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -15,8 +15,8 @@
 pipeline = Pipeline(
     pipeline_name="datacomp-filtering-pipeline",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
-    base_path=PipelineConfigs.BASE_PATH,
-    # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
+    # base_path=PipelineConfigs.BASE_PATH,
+    base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
 )
 client = Client(host=PipelineConfigs.HOST)

From 11a4ec75d8d302c639cc2c1d75001b870e0d35bd Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 14:05:40 +0200
Subject: [PATCH 17/65] Add more

---
 src/fondant/data_io.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 71eba6e58..4a52aa540 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -225,7 +225,10 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:
         with ProgressBar():
             logging.info("Writing data...")
             # visualize the low level Dask graph
+            logging.info("Visualizing task graph...")
             dask.visualize(*write_tasks, filename='task_graph.png')
+            for i, task in enumerate(write_tasks):
+                task.visualize(filename=f'task_{i}.svg')
             dd.compute(*write_tasks)

From 86a133678ba8727393dca1bc1d4c91eac2668e22 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 14:58:19 +0200
Subject: [PATCH 18/65] Add visualize

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 src/fondant/data_io.py | 2 ++
 4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index 34c2e3a05..a3637ad4b 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=797f1a81694e4d66c4fe39edbbc9fc2dafce830a
+ARG FONDANT_VERSION=51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 9310e76d6..29840d667 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:797f1a81694e4d66c4fe39edbbc9fc2dafce830a
+image: ghcr.io/ml6team/load_from_hf_hub:51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index 34b9b5ee5..a1f0c728e 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:797f1a81694e4d66c4fe39edbbc9fc2dafce830a
+image: ghcr.io/ml6team/load_from_hf_hub:51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42

 produces:
   image:
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 4a52aa540..d36412ad4 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -200,6 +200,8 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:

         dataframe.index = dataframe.index.rename("id").astype("string")

+        dataframe.visualize(filename=f'dataframe.png')
+
         # Turn index into an empty dataframe so we can write it
         index_df = dataframe.index.to_frame().drop(columns=["id"])
         write_index_task = self._write_subset(

From eefd05a7ba5ace9f10bfe3655effbbd9c59912ce Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 15:06:34 +0200
Subject: [PATCH 19/65] More improvements

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index a3637ad4b..d95a42f67 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42
+ARG FONDANT_VERSION=feac9c142c322f1c00e30c4c0f7052dfa6bf3c92
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 29840d667..0e8fbbdb9 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42
+image: ghcr.io/ml6team/load_from_hf_hub:feac9c142c322f1c00e30c4c0f7052dfa6bf3c92

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index a1f0c728e..a57a0da2c 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:51bc0b8d7e1290dd9bdc3b4016f0ece88cc07a42
+image: ghcr.io/ml6team/load_from_hf_hub:feac9c142c322f1c00e30c4c0f7052dfa6bf3c92

 produces:
   image:

From 88dec539e1d03ddbbba46bd920e1e85fe5b3eb56 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 15:33:44 +0200
Subject: [PATCH 20/65] Fix visualization

---
 examples/pipelines/datacomp/pipeline.py | 6 ++----
 src/fondant/data_io.py | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index ae14144d5..a22a42479 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,10 +39,8 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 100000,
+        "n_rows_to_load": 10000,
         "dataset_length": 12800000,
-        # "n_rows_to_load": 500000,
-        # "dataset_length": 12800000,
     },
     node_pool_name="n2-standard-128-pool",
 )
@@ -67,7 +67,7 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index d36412ad4..771af8d53 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -200,7 +200,7 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:

         dataframe.index = dataframe.index.rename("id").astype("string")

-        dataframe.visualize(filename=f'dataframe.png')
+        dataframe.visualize(filename=f'{self.manifest.base_path}/graph.png')

         # Turn index into an empty dataframe so we can write it
         index_df = dataframe.index.to_frame().drop(columns=["id"])

From a1bcb50358e9bfdbf52f62db7b097cbdd971a493 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 15:58:46 +0200
Subject: [PATCH 21/65] Remove line

---
 components/load_from_hf_hub/Dockerfile | 2 +-
 components/load_from_hf_hub/fondant_component.yaml | 2 +-
 components/load_from_hf_hub/src/main.py | 1 -
 .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +-
 examples/pipelines/datacomp/pipeline.py | 2 +-
 5 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile
index d95a42f67..7f4286878 100644
--- a/components/load_from_hf_hub/Dockerfile
+++ b/components/load_from_hf_hub/Dockerfile
@@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=feac9c142c322f1c00e30c4c0f7052dfa6bf3c92
+ARG FONDANT_VERSION=42a948b606e0c84f1e042e52d207a820a5df48d2
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
index 0e8fbbdb9..7c7ab69eb 100644
--- a/components/load_from_hf_hub/fondant_component.yaml
+++ b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:feac9c142c322f1c00e30c4c0f7052dfa6bf3c92
+image: ghcr.io/ml6team/load_from_hf_hub:42a948b606e0c84f1e042e52d207a820a5df48d2

 produces:
   dummy_variable: #TODO: fill in here
diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index 8fe5266b8..d4bacf1ba 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -61,7 +61,6 @@ def load(self) -> dd.DataFrame:
             partition_length = self.dataset_length // dask_df.npartitions
             npartitions = self.n_rows_to_load // partition_length
             dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions)
-            dask_df = dd.from_pandas(dask_df, npartitions=1)

         return dask_df
diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
index a57a0da2c..6807bfeb7 100644
--- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:feac9c142c322f1c00e30c4c0f7052dfa6bf3c92
+image: ghcr.io/ml6team/load_from_hf_hub:42a948b606e0c84f1e042e52d207a820a5df48d2

 produces:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index a22a42479..393030241 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -67,7 +67,7 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

From 4ddca85426e231edc71d18dc713e3fb4e3b3c6e8 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 16:46:32 +0200
Subject: [PATCH 22/65] More improvements

---
 components/load_from_hf_hub/src/main.py | 1 +
 examples/pipelines/datacomp/pipeline.py | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
index d4bacf1ba..8e3151ea1 100644
--- a/components/load_from_hf_hub/src/main.py
+++ b/components/load_from_hf_hub/src/main.py
@@ -61,6 +61,7 @@ def load(self) -> dd.DataFrame:
             partition_length = self.dataset_length // dask_df.npartitions
             npartitions = self.n_rows_to_load // partition_length
             dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions)
+            dask_df = dd.from_pandas(dask_df, npartitions=npartitions).reset_index(drop=True)

         return dask_df
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index 393030241..0017e720e 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -39,10 +39,11 @@
         "dataset_name": "nielsr/datacomp-small-with-embeddings",
         "image_column_names": [],
         "column_name_mapping": load_component_column_mapping,
-        "n_rows_to_load": 10000,
+        "n_rows_to_load": 100000,
         "dataset_length": 12800000,
     },
     node_pool_name="n2-standard-128-pool",
+    output_partition_size="10MB",
 )
 filter_image_resolution_op = ComponentOp.from_registry(
     name="filter_image_resolution",
@@ -67,7 +68,7 @@

 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

From 30fcfe31eca1ed5d58afd49499a59b9c18274965 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Wed, 26 Jul 2023 17:23:38 +0200
Subject: [PATCH 23/65] Add print statements

---
 components/filter_image_resolution/Dockerfile | 5 +++--
 .../filter_image_resolution/fondant_component.yaml | 2 +-
 components/filter_image_resolution/src/main.py | 4 ++++
 examples/pipelines/datacomp/pipeline.py | 2 ++
 src/fondant/data_io.py | 11 +++++++++++
 5 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile
index bc16d1c64..3817dfb39 100644
--- a/components/filter_image_resolution/Dockerfile
+++ b/components/filter_image_resolution/Dockerfile
@@ -3,7 +3,8 @@ FROM --platform=linux/amd64 python:3.8-slim
 # System dependencies
 RUN apt-get update && \
     apt-get upgrade -y && \
-    apt-get install git -y
+    apt-get install git -y && \
+    apt-get install graphviz -y

 # Install requirements
 COPY requirements.txt /
@@ -11,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=41bd8f9d8f8003b41ffa375c1887869208e519ea
+ARG FONDANT_VERSION=1aaf6068e7135b917e5e5cb16b3a2bf9d58f4a4e
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml
index 096bfb730..442dbe079 100644
--- a/components/filter_image_resolution/fondant_component.yaml
+++ b/components/filter_image_resolution/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Filter image resolution
 description: Component that filters images based on minimum size and max aspect ratio
-image: ghcr.io/ml6team/filter_image_resolution:41bd8f9d8f8003b41ffa375c1887869208e519ea
+image: ghcr.io/ml6team/filter_image_resolution:1aaf6068e7135b917e5e5cb16b3a2bf9d58f4a4e

 consumes:
   image:
diff --git a/components/filter_image_resolution/src/main.py b/components/filter_image_resolution/src/main.py
index c6e0276c0..3ab422ca8 100644
--- a/components/filter_image_resolution/src/main.py
+++ b/components/filter_image_resolution/src/main.py
@@ -24,6 +24,10 @@ def __init__(self, *_, min_image_dim: int, max_aspect_ratio:
float) -> None: self.max_aspect_ratio = max_aspect_ratio def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: + + print("Length of dataframe:", len(dataframe)) + print("First rows of dataframe:", dataframe.head()) + width = dataframe["image"]["width"] height = dataframe["image"]["height"] min_image_dim = np.minimum(width, height) diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 0017e720e..4f38c485f 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -48,6 +48,8 @@ filter_image_resolution_op = ComponentOp.from_registry( name="filter_image_resolution", arguments={"min_image_dim": 200, "max_aspect_ratio": 3}, + node_pool_name="n2-standard-128-pool", + output_partition_size='disable', ) filter_complexity_op = ComponentOp( component_dir="components/filter_text_complexity", diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 771af8d53..ea5318d52 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -93,6 +93,9 @@ def _load_subset(self, subset_name: str, fields: t.List[str]) -> dd.DataFrame: subset_df = dd.read_parquet(remote_path, columns=fields) + logger.info(f"First few rows of subset {subset_name}:") + print(subset_df.head()) + # add subset prefix to columns subset_df = subset_df.rename( columns={col: subset_name + "_" + col for col in subset_df.columns}, @@ -125,7 +128,12 @@ def load_dataframe(self) -> dd.DataFrame: as well as the index columns. """ # load index into dataframe + logging.info(f"Loading index...") dataframe = self._load_index() + + logger.info(f"First few rows of index dataframe:") + print(dataframe.head()) + for name, subset in self.component_spec.consumes.items(): fields = list(subset.fields.keys()) subset_df = self._load_subset(name, fields) @@ -140,6 +148,9 @@ def load_dataframe(self) -> dd.DataFrame: dataframe = self.partition_loaded_dataframe(dataframe) + logger.info(f"First few rows of final dataframe provided to the user:") + print(dataframe.head()) + logging.info(f"Columns of dataframe: {list(dataframe.columns)}") return dataframe From 887f48fcc777d281d1284f3a0887650b68a9187a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 27 Jul 2023 12:34:37 +0200 Subject: [PATCH 24/65] More improvements --- components/filter_image_resolution/Dockerfile | 2 +- components/filter_image_resolution/fondant_component.yaml | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 8 +++++++- .../components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile index 3817dfb39..f51015301 100644 --- a/components/filter_image_resolution/Dockerfile +++ b/components/filter_image_resolution/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=1aaf6068e7135b917e5e5cb16b3a2bf9d58f4a4e +ARG FONDANT_VERSION=f18dd510b1f106f421d433eb02b512177d5116a3 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml index 442dbe079..4f2607bc9 100644 --- 
a/components/filter_image_resolution/fondant_component.yaml +++ b/components/filter_image_resolution/fondant_component.yaml @@ -1,6 +1,6 @@ name: Filter image resolution description: Component that filters images based on minimum size and max aspect ratio -image: ghcr.io/ml6team/filter_image_resolution:1aaf6068e7135b917e5e5cb16b3a2bf9d58f4a4e +image: ghcr.io/ml6team/filter_image_resolution:f18dd510b1f106f421d433eb02b512177d5116a3 consumes: image: diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 7c7ab69eb..db6da0b01 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:42a948b606e0c84f1e042e52d207a820a5df48d2 +image: ghcr.io/ml6team/load_from_hf_hub:f18dd510b1f106f421d433eb02b512177d5116a3 produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index 8e3151ea1..686a282b4 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -61,7 +61,13 @@ def load(self) -> dd.DataFrame: partition_length = self.dataset_length // dask_df.npartitions npartitions = self.n_rows_to_load // partition_length dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions) - dask_df = dd.from_pandas(dask_df, npartitions=npartitions).reset_index(drop=True) + dask_df = dd.from_pandas(dask_df, npartitions=npartitions) + # .reset_index(drop=True) # will reset it from 0 for every partition + + # Set monotonically increasing index + dask_df["id"] = 1 + dask_df["id"] = dask_df.id.cumsum() + dask_df = dask_df.set_index("id", sort=True) return dask_df diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 6807bfeb7..9aa9bdb35 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:42a948b606e0c84f1e042e52d207a820a5df48d2 +image: ghcr.io/ml6team/load_from_hf_hub:f18dd510b1f106f421d433eb02b512177d5116a3 produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 4f38c485f..6b3645557 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -70,7 +70,7 @@ # add ops to pipeline pipeline.add_op(load_from_hub_op) -pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) +# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From ab176415315cb8a42119469835dad4d924a5c9d8 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 27 Jul 2023 13:13:34 +0200 Subject: [PATCH 25/65] More improvements --- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- 4 files changed, 4 
insertions(+), 4 deletions(-) diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index 7f4286878..c401acdb4 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=42a948b606e0c84f1e042e52d207a820a5df48d2 +ARG FONDANT_VERSION=fdd0bdac524845a9e0a359916c5a54a9b7518f1d RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index db6da0b01..a99d8ff09 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:f18dd510b1f106f421d433eb02b512177d5116a3 +image: ghcr.io/ml6team/load_from_hf_hub:fdd0bdac524845a9e0a359916c5a54a9b7518f1d produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 9aa9bdb35..20ca7e2ee 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:f18dd510b1f106f421d433eb02b512177d5116a3 +image: ghcr.io/ml6team/load_from_hf_hub:fdd0bdac524845a9e0a359916c5a54a9b7518f1d produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 6b3645557..4f38c485f 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -70,7 +70,7 @@ # add ops to pipeline pipeline.add_op(load_from_hub_op) -# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) +pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From 12ba25f625504e947ab6b625fbfe485077d620fb Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 27 Jul 2023 13:33:07 +0200 Subject: [PATCH 26/65] Comment out code --- examples/pipelines/datacomp/pipeline.py | 4 ++-- src/fondant/data_io.py | 17 +++++++---------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 4f38c485f..cbf28fa0d 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -39,11 +39,11 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 100000, + "n_rows_to_load": 500000, "dataset_length": 12800000, }, node_pool_name="n2-standard-128-pool", - output_partition_size="10MB", + # output_partition_size="10MB", ) filter_image_resolution_op = ComponentOp.from_registry( name="filter_image_resolution", diff --git a/src/fondant/data_io.py 
b/src/fondant/data_io.py index ea5318d52..818d7da53 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -198,20 +198,22 @@ def partition_written_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame: msg, ) + logger.info(f"Dataframe number of partitions is {dataframe.npartitions}") + return dataframe def write_dataframe(self, dataframe: dd.DataFrame) -> None: write_tasks = [] dataframe = self.partition_written_dataframe(dataframe) - - logger.info(f"Dataframe number of partitions is {dataframe.npartitions}") - - logger.info("Creating write tasks...") dataframe.index = dataframe.index.rename("id").astype("string") - dataframe.visualize(filename=f'{self.manifest.base_path}/graph.png') + # logging.info("Visualizing task graph...") + # TODO: doesn't work on GCP + # dataframe.visualize(filename=f'{self.manifest.base_path}/graph.png') + + logger.info("Creating write tasks...") # Turn index into an empty dataframe so we can write it index_df = dataframe.index.to_frame().drop(columns=["id"]) @@ -237,11 +239,6 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None: with ProgressBar(): logging.info("Writing data...") - # visualize the low level Dask graph - logging.info("Visualizing task graph...") - dask.visualize(*write_tasks, filename='task_graph.png') - for i, task in enumerate(write_tasks): - task.visualize(filename=f'task_{i}.svg') dd.compute(*write_tasks) @staticmethod From 6e1d318f9fed54a4ebccc3abe99a11604cc98018 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 27 Jul 2023 14:18:44 +0200 Subject: [PATCH 27/65] More improvements --- components/filter_image_resolution/Dockerfile | 2 +- components/filter_image_resolution/fondant_component.yaml | 2 +- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 1 + .../components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 7 files changed, 8 insertions(+), 7 deletions(-) diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile index f51015301..af0d837c4 100644 --- a/components/filter_image_resolution/Dockerfile +++ b/components/filter_image_resolution/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=f18dd510b1f106f421d433eb02b512177d5116a3 +ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml index 4f2607bc9..e441cac78 100644 --- a/components/filter_image_resolution/fondant_component.yaml +++ b/components/filter_image_resolution/fondant_component.yaml @@ -1,6 +1,6 @@ name: Filter image resolution description: Component that filters images based on minimum size and max aspect ratio -image: ghcr.io/ml6team/filter_image_resolution:f18dd510b1f106f421d433eb02b512177d5116a3 +image: ghcr.io/ml6team/filter_image_resolution:28ec87862c160ead773eb15b57905ac61515f8cf consumes: image: diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index c401acdb4..af0d837c4 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install 
--no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=fdd0bdac524845a9e0a359916c5a54a9b7518f1d +ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index a99d8ff09..afe3736fa 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:fdd0bdac524845a9e0a359916c5a54a9b7518f1d +image: ghcr.io/ml6team/load_from_hf_hub:28ec87862c160ead773eb15b57905ac61515f8cf produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index 686a282b4..2f4b2f9f7 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -65,6 +65,7 @@ def load(self) -> dd.DataFrame: # .reset_index(drop=True) # will reset it from 0 for every partition # Set monotonically increasing index + logger.info("Setting the index...") dask_df["id"] = 1 dask_df["id"] = dask_df.id.cumsum() dask_df = dask_df.set_index("id", sort=True) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 20ca7e2ee..8015adf5b 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:fdd0bdac524845a9e0a359916c5a54a9b7518f1d +image: ghcr.io/ml6team/load_from_hf_hub:28ec87862c160ead773eb15b57905ac61515f8cf produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index cbf28fa0d..73edc0e05 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From ef5c323a2768d57fdb892c1426aeef8a075094f9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 27 Jul 2023 14:21:02 +0200 Subject: [PATCH 28/65] Remove print statements --- components/filter_image_resolution/src/main.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/components/filter_image_resolution/src/main.py b/components/filter_image_resolution/src/main.py index 3ab422ca8..c6e0276c0 100644 --- a/components/filter_image_resolution/src/main.py +++ b/components/filter_image_resolution/src/main.py @@ -24,10 +24,6 @@ def __init__(self, *_, min_image_dim: int, max_aspect_ratio: float) -> None: self.max_aspect_ratio = max_aspect_ratio def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - - print("Length of dataframe:", len(dataframe)) - print("First rows of 
dataframe:", dataframe.head()) - width = dataframe["image"]["width"] height = dataframe["image"]["height"] min_image_dim = np.minimum(width, height)
From cfba11abbd808cd602dfc894c2be5b8fd63486ba Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Jul 2023 09:38:40 +0200 Subject: [PATCH 29/65] Fix repartitioning --- .../components/filter_text_complexity/Dockerfile | 2 +- .../filter_text_complexity/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- src/fondant/data_io.py | 9 +++------ 4 files changed, 6 insertions(+), 9 deletions(-)
diff --git a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile index 610851a01..b1c6fe14c 100644 --- a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile +++ b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile @@ -12,7 +12,7 @@ RUN python -m spacy download en_core_web_sm # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=79df895e9d62d2010ccb8d40ee7e4fd4c68f117d +ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml b/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml index 7513d5ebf..dfb43a930 100644 --- a/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml @@ -1,6 +1,6 @@ name: Filter text complexity description: Component that filters text based on their dependency parse complexity and number of actions -image: ghcr.io/ml6team/filter_text_complexity:latest +image: ghcr.io/ml6team/filter_text_complexity:28ec87862c160ead773eb15b57905ac61515f8cf consumes: text:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 73edc0e05..18f8414b1 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -71,7 +71,7 @@ # add ops to pipeline pipeline.add_op(load_from_hub_op) pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) -# pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) +pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 818d7da53..27c9c4036 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -131,13 +131,13 @@ def load_dataframe(self) -> dd.DataFrame: logging.info(f"Loading index...") dataframe = self._load_index() - logger.info(f"First few rows of index dataframe:") - print(dataframe.head()) - for name, subset in self.component_spec.consumes.items(): fields = list(subset.fields.keys()) subset_df = self._load_subset(name, fields) # left joins -> filter on index + # make sure that dataframe has same number of partitions + # as subset + dataframe = dataframe.repartition(npartitions=subset_df.npartitions) dataframe = dd.merge( dataframe, subset_df, @@ -148,9 +148,6 @@ def load_dataframe(self) -> dd.DataFrame: dataframe = self.partition_loaded_dataframe(dataframe) - logger.info(f"First few rows of final
dataframe provided to the user:") - print(dataframe.head()) - logging.info(f"Columns of dataframe: {list(dataframe.columns)}") return dataframe From 014543ba0cbc3128ab4e1cd230dfce6d15efd20d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Jul 2023 10:55:52 +0200 Subject: [PATCH 30/65] More improvements --- components/filter_image_resolution/Dockerfile | 2 +- components/filter_image_resolution/fondant_component.yaml | 2 +- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/filter_text_complexity/Dockerfile | 2 +- .../components/filter_text_complexity/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile index af0d837c4..177c9bf15 100644 --- a/components/filter_image_resolution/Dockerfile +++ b/components/filter_image_resolution/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf +ARG FONDANT_VERSION=f3f3925b8e8f634e2978e5c7fcefa72c53baba7c RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/filter_image_resolution/fondant_component.yaml b/components/filter_image_resolution/fondant_component.yaml index e441cac78..41226e2cb 100644 --- a/components/filter_image_resolution/fondant_component.yaml +++ b/components/filter_image_resolution/fondant_component.yaml @@ -1,6 +1,6 @@ name: Filter image resolution description: Component that filters images based on minimum size and max aspect ratio -image: ghcr.io/ml6team/filter_image_resolution:28ec87862c160ead773eb15b57905ac61515f8cf +image: ghcr.io/ml6team/filter_image_resolution:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c consumes: image: diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index af0d837c4..177c9bf15 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf +ARG FONDANT_VERSION=f3f3925b8e8f634e2978e5c7fcefa72c53baba7c RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index afe3736fa..01f8022f5 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:28ec87862c160ead773eb15b57905ac61515f8cf +image: ghcr.io/ml6team/load_from_hf_hub:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile index b1c6fe14c..c7ede3184 100644 --- a/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile +++ 
b/examples/pipelines/datacomp/components/filter_text_complexity/Dockerfile @@ -12,7 +12,7 @@ RUN python -m spacy download en_core_web_sm # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=28ec87862c160ead773eb15b57905ac61515f8cf +ARG FONDANT_VERSION=f3f3925b8e8f634e2978e5c7fcefa72c53baba7c RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml b/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml index dfb43a930..143032b82 100644 --- a/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/filter_text_complexity/fondant_component.yaml @@ -1,6 +1,6 @@ name: Filter text complexity description: Component that filters text based on their dependency parse complexity and number of actions -image: ghcr.io/ml6team/filter_text_complexity:28ec87862c160ead773eb15b57905ac61515f8cf +image: ghcr.io/ml6team/filter_text_complexity:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c consumes: text: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 18f8414b1..c6d771bc5 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From b63c5cbd0b4028e28355a55ad8f7d833a5e64fb9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 28 Jul 2023 11:22:31 +0200 Subject: [PATCH 31/65] More improvements --- .../components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 8015adf5b..87e799c3f 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:28ec87862c160ead773eb15b57905ac61515f8cf +image: ghcr.io/ml6team/load_from_hf_hub:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index c6d771bc5..18f8414b1 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From ce67179cd3674bf606e58ed8e2fb20b50975a0d3 Mon Sep 
17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 10:45:44 +0200 Subject: [PATCH 32/65] Add download images component --- .../components/download_images/Dockerfile | 23 ++ .../components/download_images/README.md | 12 + .../download_images/fondant_component.yaml | 49 ++++ .../download_images/requirements.txt | 2 + .../components/download_images/src/main.py | 159 +++++++++++ .../components/download_images/src/resizer.py | 258 ++++++++++++++++++ examples/pipelines/datacomp/pipeline.py | 8 +- 7 files changed, 509 insertions(+), 2 deletions(-) create mode 100644 examples/pipelines/datacomp/components/download_images/Dockerfile create mode 100644 examples/pipelines/datacomp/components/download_images/README.md create mode 100644 examples/pipelines/datacomp/components/download_images/fondant_component.yaml create mode 100644 examples/pipelines/datacomp/components/download_images/requirements.txt create mode 100644 examples/pipelines/datacomp/components/download_images/src/main.py create mode 100644 examples/pipelines/datacomp/components/download_images/src/resizer.py
diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile new file mode 100644 index 000000000..abfa9a414 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/Dockerfile @@ -0,0 +1,23 @@ +FROM --platform=linux/amd64 python:3.8-slim + +# System dependencies +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install git -y + +# Install requirements +COPY requirements.txt / +RUN pip3 install --no-cache-dir -r requirements.txt + +# Install Fondant +# This is split from other requirements to leverage caching +ARG FONDANT_VERSION=main +RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} + +# Set the working directory to the component folder +WORKDIR /component/src + +# Copy over src-files +COPY src/ . + +ENTRYPOINT ["python", "main.py"] \ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/download_images/README.md b/examples/pipelines/datacomp/components/download_images/README.md new file mode 100644 index 000000000..f9fb8b866 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/README.md @@ -0,0 +1,12 @@ +# download_images + +### Description +This component takes in image URLs as input and downloads the images, along with some metadata (like their height and width). +The images are stored in a new column as bytes objects. This component also resizes the images using the [resizer](https://github.com/rom1504/img2dataset/blob/main/img2dataset/resizer.py) function from the img2dataset library. + +If the component is unable to retrieve the image at a URL (for any reason), it will return `None` for that particular URL. + +### **Inputs/Outputs** + +See [`fondant_component.yaml`](fondant_component.yaml) for a more detailed description of all the input/output parameters.
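+
+As a small illustration of the `None` behaviour described above (hypothetical values and column names, not part of the component's code): rows whose download failed are simply removed by the component's `dropna()` step.
+
+```python
+import pandas as pd
+
+# Hypothetical download results: the second URL could not be retrieved,
+# so all of its image fields are None.
+df = pd.DataFrame({
+    "image_url": ["https://example.com/a.jpg", "https://example.com/b.jpg"],
+    "image_data": [b"\x89PNG...", None],
+    "image_width": [256, None],
+    "image_height": [256, None],
+})
+
+# Mirrors the component's dropna() call: the failed row disappears.
+print(len(df.dropna()))  # 1
+```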
+ diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml new file mode 100644 index 000000000..f1e089777 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -0,0 +1,49 @@ +name: Download images +description: Component that downloads images based on URLs +image: ghcr.io/ml6team/download_images:dev + +consumes: + image: + fields: + url: + type: string + +produces: + image: + fields: + data: + type: binary + width: + type: int16 + height: + type: int16 + +args: + timeout: + description: Maximum time (in seconds) to wait when trying to download an image + type: int + default: 10 + retries: + description: Number of times to retry downloading an image if it fails. + type: int + default: 0 + image_size: + description: Size of the images after resizing. + type: int + default: 256 + resize_mode: + description: Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". + type: str + default: 'border' + resize_only_if_bigger: + description: If True, resize only if image is bigger than image_size. + type: bool + default: 'False' + min_image_size: + description: Minimum size of the images. + type: int + default: 0 + max_aspect_ratio: + description: Maximum aspect ratio of the images. + type: float + default: 'inf' \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/download_images/requirements.txt b/examples/pipelines/datacomp/components/download_images/requirements.txt new file mode 100644 index 000000000..de9d1dfe3 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/requirements.txt @@ -0,0 +1,2 @@ +albumentations==1.3.0 +opencv-python-headless>=4.5.5.62,<5 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py new file mode 100644 index 000000000..64d04e967 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -0,0 +1,159 @@ +""" +This component downloads images based on URLs, and resizes them based on various settings like +minimum image size and aspect ratio. + +Some functions here are directly taken from +https://github.com/rom1504/img2dataset/blob/main/img2dataset/downloader.py. 
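+Images that cannot be fetched or decoded are represented as None values rather than raising, so a single bad URL does not abort the job.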
+""" +import io +import logging +import traceback +import urllib + +import dask.dataframe as dd +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor +from resizer import Resizer + +logger = logging.getLogger(__name__) + + +def is_disallowed(headers, user_agent_token, disallowed_header_directives): + """Check if HTTP headers contain an X-Robots-Tag directive disallowing usage.""" + for values in headers.get_all("X-Robots-Tag", []): + try: + uatoken_directives = values.split(":", 1) + directives = [x.strip().lower() for x in uatoken_directives[-1].split(",")] + ua_token = ( + uatoken_directives[0].lower() if len(uatoken_directives) == 2 # noqa: PLR2004 + else None + ) + if (ua_token is None or ua_token == user_agent_token) and any( + x in disallowed_header_directives for x in directives + ): + return True + except Exception as err: + traceback.print_exc() + print(f"Failed to parse X-Robots-Tag: {values}: {err}") + return False + + +def download_image(url, timeout, user_agent_token, disallowed_header_directives): + """Download an image with urllib.""" + img_stream = None + user_agent_string = ( + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0" + ) + if user_agent_token: + user_agent_string += f" (compatible; {user_agent_token}; " \ + f"+https://github.com/rom1504/img2dataset)" + try: + request = urllib.request.Request( + url, data=None, headers={"User-Agent": user_agent_string}, + ) + with urllib.request.urlopen(request, timeout=timeout) as r: + if disallowed_header_directives and is_disallowed( + r.headers, + user_agent_token, + disallowed_header_directives, + ): + return None + img_stream = io.BytesIO(r.read()) + return img_stream + except Exception: + if img_stream is not None: + img_stream.close() + return None + + +def download_image_with_retry( + url, + *, + timeout, + retries, + resizer, + user_agent_token=None, + disallowed_header_directives=None, +): + for _ in range(retries + 1): + img_stream = download_image( + url, timeout, user_agent_token, disallowed_header_directives, + ) + if img_stream is not None: + # resize the image + img_str, width, height = resizer(img_stream) + return img_str, width, height + return None, None, None + + +class DownloadImagesComponent(DaskTransformComponent): + """Component that downloads images based on URLs.""" + + def __init__(self, + *_, + timeout: int, + retries: int, + image_size: int, + resize_mode: str, + resize_only_if_bigger: bool, + min_image_size: int, + max_aspect_ratio: float, + ): + """Component that downloads images from a list of URLs and executes filtering and resizing. + + Args: + timeout: Maximum time (in seconds) to wait when trying to download an image. + retries: Number of times to retry downloading an image if it fails. + image_size: Size of the images after resizing. + resize_mode: Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". + resize_only_if_bigger: If True, resize only if image is bigger than image_size. + min_image_size: Minimum size of the images. + max_aspect_ratio: Maximum aspect ratio of the images. 
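+                Images outside the size or aspect-ratio bounds are returned as (None, None, None) by the resizer and dropped afterwards.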
+ + Returns: + Dask dataframe + """ + self.timeout = timeout + self.retries = retries + self.resizer = Resizer( + image_size=image_size, + resize_mode=resize_mode, + resize_only_if_bigger=resize_only_if_bigger, + min_image_size=min_image_size, + max_aspect_ratio=max_aspect_ratio, + ) + + def transform( + self, + dataframe: dd.DataFrame, + ) -> dd.DataFrame: + logger.info("Instantiating resizer...") + + # Remove duplicates from laion retrieval + dataframe = dataframe.drop_duplicates() + + result = dataframe.apply( + lambda example: download_image_with_retry( + url=example.images_url, + timeout=self.timeout, + retries=self.retries, + resizer=self.resizer, + ), + axis=1, + result_type="expand", + meta={0: bytes, 1: int, 2: int}, + ) + + result.columns = [("image", "data"), ("image", "width"), ("image", "height")] + + dataframe = dataframe.merge(result, left_index=True, right_index=True) + + # Remove images that could not be fetched + dataframe = dataframe.dropna() + + return dataframe + + +if __name__ == "__main__": + executor = DaskTransformExecutor.from_args() + executor.execute(DownloadImagesComponent) diff --git a/examples/pipelines/datacomp/components/download_images/src/resizer.py b/examples/pipelines/datacomp/components/download_images/src/resizer.py new file mode 100644 index 000000000..f545a0bf1 --- /dev/null +++ b/examples/pipelines/datacomp/components/download_images/src/resizer.py @@ -0,0 +1,258 @@ +"""resizer module handle image resizing. + +Source: https://github.com/rom1504/img2dataset/blob/main/img2dataset/resizer.py. + +MIT License + +Copyright (c) 2021 Romain Beaumont +""" +# ruff: noqa + +import albumentations as A +import cv2 +import numpy as np +from enum import Enum +import imghdr +import os + +_INTER_STR_TO_CV2 = dict( + nearest=cv2.INTER_NEAREST, + linear=cv2.INTER_LINEAR, + bilinear=cv2.INTER_LINEAR, + cubic=cv2.INTER_CUBIC, + bicubic=cv2.INTER_CUBIC, + area=cv2.INTER_AREA, + lanczos=cv2.INTER_LANCZOS4, + lanczos4=cv2.INTER_LANCZOS4, +) + + +class ResizeMode(Enum): + no = 0 # pylint: disable=invalid-name + keep_ratio = 1 # pylint: disable=invalid-name + center_crop = 2 # pylint: disable=invalid-name + border = 3 # pylint: disable=invalid-name + keep_ratio_largest = 4 # pylint: disable=invalid-name + + +# thanks https://stackoverflow.com/questions/11130156/suppress-stdout-stderr-print-from-python-functions +class SuppressStdoutStderr: + """ + A context manager for doing a "deep suppression" of stdout and stderr in + Python, i.e. will suppress all print, even if the print originates in a + compiled C/Fortran sub-function. + This will not suppress raised exceptions, since exceptions are printed + to stderr just before a script exits, and after the context manager has + exited (at least, I think that is why it lets exceptions through). + + """ + + def __init__(self): + # Open a pair of null files + self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] + # Save the actual stdout (1) and stderr (2) file descriptors. + self.save_fds = [os.dup(1), os.dup(2)] + + def __enter__(self): + # Assign the null pointers to stdout and stderr. 
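+        # os.dup2 closes fds 1 and 2 and points them at /dev/null, so even
+        # C-level writes to stdout/stderr are swallowed until __exit__ restores
+        # the descriptors saved above.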
+ os.dup2(self.null_fds[0], 1) + os.dup2(self.null_fds[1], 2) + + def __exit__(self, *_): + # Re-assign the real stdout/stderr back to (1) and (2) + os.dup2(self.save_fds[0], 1) + os.dup2(self.save_fds[1], 2) + # Close all file descriptors + for fd in self.null_fds + self.save_fds: + os.close(fd) + + +def inter_str_to_cv2(inter_str): + inter_str = inter_str.lower() + if inter_str not in _INTER_STR_TO_CV2: + raise Exception(f"Invalid option for interpolation: {inter_str}") + return _INTER_STR_TO_CV2[inter_str] + + +class Resizer: + """ + Resize images + Expose a __call__ method to be used as a callable object + + Should be used to resize one image at a time + + Options: + resize_mode: "no", "keep_ratio", "center_crop", "border" + resize_only_if_bigger: if True, resize only if image is bigger than image_size + image_size: size of the output image to resize + """ + + def __init__( + self, + image_size, + resize_mode, + resize_only_if_bigger, + upscale_interpolation="lanczos", + downscale_interpolation="area", + encode_quality=95, + encode_format="jpg", + skip_reencode=False, + disable_all_reencoding=False, + min_image_size=0, + max_image_area=float("inf"), + max_aspect_ratio=float("inf"), + blurrer=None, + ): + if encode_format not in ["jpg", "png", "webp"]: + raise ValueError(f"Invalid encode format {encode_format}") + if encode_format == "png": + if encode_quality < 0 or encode_quality > 9: + raise ValueError( + "For png, encode quality represents compression which" + f"must be between 0 and 9, got {encode_quality}" + ) + + self.image_size = image_size + if isinstance(resize_mode, str): + if ( + resize_mode not in ResizeMode.__members__ + ): # pylint: disable=unsupported-membership-test + raise Exception(f"Invalid option for resize_mode: {resize_mode}") + resize_mode = ResizeMode[resize_mode] + self.resize_mode = resize_mode + self.resize_only_if_bigger = resize_only_if_bigger + self.upscale_interpolation = inter_str_to_cv2(upscale_interpolation) + self.downscale_interpolation = inter_str_to_cv2(downscale_interpolation) + self.encode_format = encode_format + cv2_img_quality = None + if encode_format == "jpg": + cv2_img_quality = int(cv2.IMWRITE_JPEG_QUALITY) + self.what_ext = "jpeg" + elif encode_format == "png": + cv2_img_quality = int(cv2.IMWRITE_PNG_COMPRESSION) + self.what_ext = "png" + elif encode_format == "webp": + cv2_img_quality = int(cv2.IMWRITE_WEBP_QUALITY) + self.what_ext = "webp" + if cv2_img_quality is None: + raise Exception(f"Invalid option for encode_format: {encode_format}") + self.encode_params = [cv2_img_quality, encode_quality] + self.skip_reencode = skip_reencode + self.disable_all_reencoding = disable_all_reencoding + self.min_image_size = min_image_size + self.max_image_area = max_image_area + self.max_aspect_ratio = max_aspect_ratio + self.blurrer = blurrer + + def __call__(self, img_stream, blurring_bbox_list=None): + """ + input: an image stream, optionally a list of bounding boxes to blur. 
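+        (boxes are only blurred when a blurrer was configured at init; passing boxes without one causes the image to be dropped)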
+        output: img_str, width, height + """ + try: + if self.disable_all_reencoding: + return img_stream.read(), None, None + with SuppressStdoutStderr(): + cv2.setNumThreads(1) + img_stream.seek(0) + encode_needed = ( + imghdr.what(img_stream) != self.what_ext + if self.skip_reencode + else True + ) + img_stream.seek(0) + img_buf = np.frombuffer(img_stream.read(), np.uint8) + img = cv2.imdecode(img_buf, cv2.IMREAD_UNCHANGED) + if img is None: + raise Exception("Image decoding error") + if len(img.shape) == 3 and img.shape[-1] == 4: + # alpha matting with white background + alpha = img[:, :, 3, np.newaxis] + img = alpha / 255 * img[..., :3] + 255 - alpha + img = np.rint(img.clip(min=0, max=255)).astype(np.uint8) + encode_needed = True + original_height, original_width = img.shape[:2] + # check if image is too small + if min(original_height, original_width) < self.min_image_size: + return None, None, None + if original_height * original_width > self.max_image_area: + return None, None, None + # check if wrong aspect ratio + if ( + max(original_height, original_width) + / min(original_height, original_width) + > self.max_aspect_ratio + ): + return None, None, None + + # check if resizer was defined during init if needed + if blurring_bbox_list is not None and self.blurrer is None: + return None, None, None + + # Flag to check if blurring is still needed. + maybe_blur_still_needed = True + + # resizing in following conditions + if self.resize_mode in (ResizeMode.keep_ratio, ResizeMode.center_crop): + downscale = min(original_width, original_height) > self.image_size + if not self.resize_only_if_bigger or downscale: + interpolation = ( + self.downscale_interpolation + if downscale + else self.upscale_interpolation + ) + img = A.smallest_max_size( + img, self.image_size, interpolation=interpolation + ) + if blurring_bbox_list is not None and self.blurrer is not None: + img = self.blurrer(img=img, bbox_list=blurring_bbox_list) + if self.resize_mode == ResizeMode.center_crop: + img = A.center_crop(img, self.image_size, self.image_size) + encode_needed = True + maybe_blur_still_needed = False + elif self.resize_mode in ( + ResizeMode.border, + ResizeMode.keep_ratio_largest, + ): + downscale = max(original_width, original_height) > self.image_size + if not self.resize_only_if_bigger or downscale: + interpolation = ( + self.downscale_interpolation + if downscale + else self.upscale_interpolation + ) + img = A.longest_max_size( + img, self.image_size, interpolation=interpolation + ) + if blurring_bbox_list is not None and self.blurrer is not None: + img = self.blurrer(img=img, bbox_list=blurring_bbox_list) + if self.resize_mode == ResizeMode.border: + img = A.pad( + img, + self.image_size, + self.image_size, + border_mode=cv2.BORDER_CONSTANT, + value=[255, 255, 255], + ) + encode_needed = True + maybe_blur_still_needed = False + + # blur parts of the image if needed + if ( + maybe_blur_still_needed + and blurring_bbox_list is not None + and self.blurrer is not None + ): + img = self.blurrer(img=img, bbox_list=blurring_bbox_list) + + height, width = img.shape[:2] + if encode_needed: + img_str = cv2.imencode( + f".{self.encode_format}", img, params=self.encode_params + )[1].tobytes() + else: + img_str = img_buf.tobytes() + return img_str, width, height + + except Exception as err: # pylint: disable=broad-except + return None, None, None
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 18f8414b1..10022ddc8 100644 ---
a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -67,11 +67,15 @@ "num_clusters": 3, }, ) +download_images_op = ComponentOp( + component_dir="components/download_images", +) # add ops to pipeline pipeline.add_op(load_from_hub_op) -pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) -pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) +# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) +# pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) +pipeline.add_op(download_images_op, dependencies=load_from_hub_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From 3d9f1197bd8cbe07c1adf61189e2cc78ad9deb66 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 11:49:26 +0200 Subject: [PATCH 33/65] Update script --- .../load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 5 +++-- .../components/download_images/Dockerfile | 2 +- .../download_images/fondant_component.yaml | 2 +- .../components/download_images/src/main.py | 21 ++++++------------- .../load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- 7 files changed, 14 insertions(+), 22 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 01f8022f5..05b65a56e 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c +image: ghcr.io/ml6team/load_from_hf_hub:3c9ea91ff9221286f9a228c61c3cea44e5499a12 produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index 2f4b2f9f7..a2103dbbf 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -57,9 +57,10 @@ def load(self) -> dd.DataFrame: raise ValueError("""Make sure to also specify the length of the entire dataset. This is required as otherwise only the first partition can be loaded""") - logger.info(f"Loading approximately {self.n_rows_to_load} rows...") + logger.info(f"""Loading approximately {self.n_rows_to_load} rows... 
+ at least one partition""") partition_length = self.dataset_length // dask_df.npartitions - npartitions = self.n_rows_to_load // partition_length + npartitions = max(self.n_rows_to_load // partition_length, 1) dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions) dask_df = dd.from_pandas(dask_df, npartitions=npartitions) # .reset_index(drop=True) # will reset it from 0 for every partition diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile index abfa9a414..dcdb7bb29 100644 --- a/examples/pipelines/datacomp/components/download_images/Dockerfile +++ b/examples/pipelines/datacomp/components/download_images/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=main +ARG FONDANT_VERSION=3c9ea91ff9221286f9a228c61c3cea44e5499a12 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index f1e089777..9f800040b 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:dev +image: ghcr.io/ml6team/download_images:3c9ea91ff9221286f9a228c61c3cea44e5499a12 consumes: image: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 64d04e967..c6b89e392 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -11,8 +11,8 @@ import urllib import dask.dataframe as dd -from fondant.component import DaskTransformComponent -from fondant.executor import DaskTransformExecutor +from fondant.component import PandasTransformComponent +from fondant.executor import PandasTransformExecutor from resizer import Resizer logger = logging.getLogger(__name__) @@ -86,7 +86,7 @@ def download_image_with_retry( return None, None, None -class DownloadImagesComponent(DaskTransformComponent): +class DownloadImagesComponent(PandasTransformComponent): """Component that downloads images based on URLs.""" def __init__(self, @@ -129,25 +129,16 @@ def transform( ) -> dd.DataFrame: logger.info("Instantiating resizer...") - # Remove duplicates from laion retrieval - dataframe = dataframe.drop_duplicates() - - result = dataframe.apply( + dataframe[[("image", "data"), ("image", "width"), ("image", "height")]] = dataframe.apply( lambda example: download_image_with_retry( - url=example.images_url, + url=example.image.url, timeout=self.timeout, retries=self.retries, resizer=self.resizer, ), axis=1, - result_type="expand", - meta={0: bytes, 1: int, 2: int}, ) - result.columns = [("image", "data"), ("image", "width"), ("image", "height")] - - dataframe = dataframe.merge(result, left_index=True, right_index=True) - # Remove images that could not be fetched dataframe = dataframe.dropna() @@ -155,5 +146,5 @@ def transform( if __name__ == "__main__": - executor = DaskTransformExecutor.from_args() + executor = 
PandasTransformExecutor.from_args() executor.execute(DownloadImagesComponent) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 87e799c3f..072ae0996 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:f3f3925b8e8f634e2978e5c7fcefa72c53baba7c +image: ghcr.io/ml6team/load_from_hf_hub:3c9ea91ff9221286f9a228c61c3cea44e5499a12 produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 10022ddc8..b43228df9 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -39,7 +39,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 500000, + "n_rows_to_load": 10000, "dataset_length": 12800000, }, node_pool_name="n2-standard-128-pool", From e288749a9f126ee51574b86cd0b3f2b1677f7cd7 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 12:06:33 +0200 Subject: [PATCH 34/65] Remove graphviz --- components/filter_image_resolution/Dockerfile | 3 +-- components/load_from_hf_hub/Dockerfile | 3 +-- .../components/download_images/fondant_component.yaml | 2 +- pyproject.toml | 1 - src/fondant/data_io.py | 4 ---- 5 files changed, 3 insertions(+), 10 deletions(-) diff --git a/components/filter_image_resolution/Dockerfile b/components/filter_image_resolution/Dockerfile index 177c9bf15..5db8abca7 100644 --- a/components/filter_image_resolution/Dockerfile +++ b/components/filter_image_resolution/Dockerfile @@ -3,8 +3,7 @@ FROM --platform=linux/amd64 python:3.8-slim # System dependencies RUN apt-get update && \ apt-get upgrade -y && \ - apt-get install git -y && \ - apt-get install graphviz -y + apt-get install git -y # Install requirements COPY requirements.txt / diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index 177c9bf15..5db8abca7 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -3,8 +3,7 @@ FROM --platform=linux/amd64 python:3.8-slim # System dependencies RUN apt-get update && \ apt-get upgrade -y && \ - apt-get install git -y && \ - apt-get install graphviz -y + apt-get install git -y # Install requirements COPY requirements.txt / diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 9f800040b..760d5b829 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:3c9ea91ff9221286f9a228c61c3cea44e5499a12 +image: ghcr.io/ml6team/download_images:06b316c830abe72d9d9a71f914c9c5fd205ec88b consumes: image: diff --git a/pyproject.toml b/pyproject.toml index b074eb7a2..7b837de9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,6 @@ dask = {extras = ["dataframe"], version = ">= 2023.4.1"} importlib-resources = { version = ">= 1.3", python = "<3.9" 
} jsonschema = ">= 4.18" pyarrow = ">= 11.0.0" -graphviz = ">= 0.20.1" fsspec = { version = ">= 2023.4.0", optional = true} gcsfs = { version = ">= 2023.4.0", optional = true } diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 27c9c4036..1f109a96c 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -206,10 +206,6 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None: dataframe.index = dataframe.index.rename("id").astype("string") - # logging.info("Visualizing task graph...") - # TODO: doesn't work on GCP - # dataframe.visualize(filename=f'{self.manifest.base_path}/graph.png') - logger.info("Creating write tasks...") # Turn index into an empty dataframe so we can write it From 6e6bd6af759220fb27557d737d2bfef04b52953f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 15:29:51 +0200 Subject: [PATCH 35/65] More improvements --- .../load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 2 + .../components/download_images/Dockerfile | 2 +- .../download_images/fondant_component.yaml | 3 +- .../components/download_images/src/main.py | 44 ++++++++++++------- .../load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 1 + 7 files changed, 37 insertions(+), 19 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 05b65a56e..7a76c1f05 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:3c9ea91ff9221286f9a228c61c3cea44e5499a12 +image: ghcr.io/ml6team/load_from_hf_hub:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index a2103dbbf..1427c0954 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -71,6 +71,8 @@ def load(self) -> dd.DataFrame: dask_df["id"] = dask_df.id.cumsum() dask_df = dask_df.set_index("id", sort=True) + print("Length of the dataframe:", len(dask_df)) + return dask_df diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile index dcdb7bb29..787f89bb4 100644 --- a/examples/pipelines/datacomp/components/download_images/Dockerfile +++ b/examples/pipelines/datacomp/components/download_images/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=3c9ea91ff9221286f9a228c61c3cea44e5499a12 +ARG FONDANT_VERSION=7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 760d5b829..069e6888a 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: 
ghcr.io/ml6team/download_images:06b316c830abe72d9d9a71f914c9c5fd205ec88b +image: ghcr.io/ml6team/download_images:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 consumes: image: @@ -17,6 +17,7 @@ produces: type: int16 height: type: int16 + additionalFields: false args: timeout: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index c6b89e392..bdc181fa9 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -11,8 +11,9 @@ import urllib import dask.dataframe as dd -from fondant.component import PandasTransformComponent -from fondant.executor import PandasTransformExecutor + +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor from resizer import Resizer logger = logging.getLogger(__name__) @@ -79,6 +80,7 @@ def download_image_with_retry( img_stream = download_image( url, timeout, user_agent_token, disallowed_header_directives, ) + print("Img stream:", img_stream) if img_stream is not None: # resize the image img_str, width, height = resizer(img_stream) @@ -86,7 +88,7 @@ def download_image_with_retry( return None, None, None -class DownloadImagesComponent(PandasTransformComponent): +class DownloadImagesComponent(DaskTransformComponent): """Component that downloads images based on URLs.""" def __init__(self, @@ -123,28 +125,40 @@ def __init__(self, max_aspect_ratio=max_aspect_ratio, ) - def transform( - self, - dataframe: dd.DataFrame, - ) -> dd.DataFrame: - logger.info("Instantiating resizer...") + def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + logger.info("Downloading images...") + + print("Timeout:", self.timeout) + print("Retries:", self.retries) - dataframe[[("image", "data"), ("image", "width"), ("image", "height")]] = dataframe.apply( + print("Columns of dataframe:", dataframe.columns) + print("First rows of dataframe:", dataframe.head(5)) + + result = dataframe.apply( lambda example: download_image_with_retry( - url=example.image.url, + url=example.image_url, timeout=self.timeout, retries=self.retries, resizer=self.resizer, ), axis=1, + result_type="expand", + meta={0: bytes, 1: int, 2: int}, ) - # Remove images that could not be fetched - dataframe = dataframe.dropna() + result.columns = [ + "images_data", + "images_width", + "images_height", + ] + + print("Length of the result:", len(result)) + print("Columns of result:", result.columns) + print("First rows of result:", result.head()) - return dataframe + return result if __name__ == "__main__": - executor = PandasTransformExecutor.from_args() - executor.execute(DownloadImagesComponent) + executor = DaskTransformExecutor.from_args() + executor.execute(DownloadImagesComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 072ae0996..d7052ac07 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:3c9ea91ff9221286f9a228c61c3cea44e5499a12 +image: ghcr.io/ml6team/load_from_hf_hub:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 produces: image: diff --git 
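For reference, the row-wise "expand" pattern this patch switches to can be exercised in isolation. The snippet below is a toy sketch with made-up column names and a stand-in fetch function, not the component's actual code; it shows how result_type="expand" plus a per-column meta turns a tuple-returning row function into three new columns.

    import dask.dataframe as dd
    import pandas as pd

    def fake_fetch(url):
        # Stand-in for download_image_with_retry: returns (data, width, height).
        return b"\x00", 256, 256

    df = dd.from_pandas(pd.DataFrame({"image_url": ["http://a", "http://b"]}), npartitions=1)

    result = df.apply(
        lambda row: fake_fetch(row.image_url),
        axis=1,
        result_type="expand",
        meta={0: bytes, 1: int, 2: int},  # one dtype per output column
    )
    result.columns = ["images_data", "images_width", "images_height"]
    print(result.compute())
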
a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index b43228df9..9e5f4b6ab 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -69,6 +69,7 @@ ) download_images_op = ComponentOp( component_dir="components/download_images", + node_pool_name="n2-standard-128-pool", ) # add ops to pipeline From 3a97346724235e607ffdc2e4981259c95132ccfe Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 16:04:52 +0200 Subject: [PATCH 36/65] Debug --- components/load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 3 +++ .../components/download_images/fondant_component.yaml | 2 +- .../datacomp/components/download_images/src/main.py | 9 +++------ .../components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 7a76c1f05..9b658a2e9 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 +image: ghcr.io/ml6team/load_from_hf_hub:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index 1427c0954..4ca9f6749 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -71,6 +71,9 @@ def load(self) -> dd.DataFrame: dask_df["id"] = dask_df.id.cumsum() dask_df = dask_df.set_index("id", sort=True) + # let's just load 10 rows for debugging + dask_df = dask_df.head(10) + dask_df = dd.from_pandas(dask_df, npartitions=1) print("Length of the dataframe:", len(dask_df)) return dask_df diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 069e6888a..a32a6dddf 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 +image: ghcr.io/ml6team/download_images:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 consumes: image: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index bdc181fa9..45b9fd615 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -130,19 +130,16 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: print("Timeout:", self.timeout) print("Retries:", self.retries) - - print("Columns of dataframe:", dataframe.columns) - print("First rows of dataframe:", dataframe.head(5)) - result = dataframe.apply( + result = dataframe.map_partitions( lambda example: download_image_with_retry( url=example.image_url, timeout=self.timeout, retries=self.retries, resizer=self.resizer, ), - axis=1, - result_type="expand", + # axis=1, + # result_type="expand", meta={0: bytes, 
1: int, 2: int}, ) diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index d7052ac07..13bb38d5f 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 +image: ghcr.io/ml6team/load_from_hf_hub:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 9e5f4b6ab..15475eef1 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From d1882ec19d82657c2127ccbfbb14685266cc11b1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 16:15:55 +0200 Subject: [PATCH 37/65] Use Pandas component --- .../load_from_hf_hub/fondant_component.yaml | 2 +- components/load_from_hf_hub/src/main.py | 5 --- .../components/download_images/src/main.py | 36 +++++++++---------- .../load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- 5 files changed, 21 insertions(+), 26 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 9b658a2e9..94fa473ea 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 +image: ghcr.io/ml6team/load_from_hf_hub:184080f269fe2f17e01d418fda221733003ac2b5 produces: dummy_variable: #TODO: fill in here diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py index 4ca9f6749..a2103dbbf 100644 --- a/components/load_from_hf_hub/src/main.py +++ b/components/load_from_hf_hub/src/main.py @@ -71,11 +71,6 @@ def load(self) -> dd.DataFrame: dask_df["id"] = dask_df.id.cumsum() dask_df = dask_df.set_index("id", sort=True) - # let's just load 10 rows for debugging - dask_df = dask_df.head(10) - dask_df = dd.from_pandas(dask_df, npartitions=1) - print("Length of the dataframe:", len(dask_df)) - return dask_df diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 45b9fd615..249c1f8fb 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -10,10 +10,10 @@ import traceback import urllib -import dask.dataframe as dd +import pandas as pd -from fondant.component import DaskTransformComponent -from fondant.executor import DaskTransformExecutor +from fondant.component import PandasTransformComponent +from fondant.executor import 
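A note on the map_partitions experiment above: Dask calls the mapped function once per partition with a plain pandas DataFrame, not once per row, so a row-style lambda (url=example.image_url) receives the wrong object. A minimal sketch of the partition-level contract, with toy data:

    import dask.dataframe as dd
    import pandas as pd

    df = dd.from_pandas(pd.DataFrame({"image_url": ["http://a", "http://b"]}), npartitions=2)

    def process_partition(part: pd.DataFrame) -> pd.DataFrame:
        # `part` is a whole pandas DataFrame; row-wise work happens inside it.
        part = part.copy()
        part["url_length"] = part["image_url"].str.len()
        return part

    out = df.map_partitions(process_partition, meta={"image_url": str, "url_length": int})
    print(out.compute())
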
PandasTransformExecutor from resizer import Resizer logger = logging.getLogger(__name__) @@ -88,7 +88,7 @@ def download_image_with_retry( return None, None, None -class DownloadImagesComponent(DaskTransformComponent): +class DownloadImagesComponent(PandasTransformComponent): """Component that downloads images based on URLs.""" def __init__(self, @@ -125,37 +125,37 @@ def __init__(self, max_aspect_ratio=max_aspect_ratio, ) - def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: logger.info("Downloading images...") print("Timeout:", self.timeout) print("Retries:", self.retries) - result = dataframe.map_partitions( + dataframe[[("image", "data"), ("image", "width"), ("images", "height")]] = dataframe["image"]["url"].apply( lambda example: download_image_with_retry( - url=example.image_url, + url=example, timeout=self.timeout, retries=self.retries, resizer=self.resizer, ), # axis=1, # result_type="expand", - meta={0: bytes, 1: int, 2: int}, + # meta={0: bytes, 1: int, 2: int}, ) - result.columns = [ - "images_data", - "images_width", - "images_height", - ] + # result.columns = [ + # "images_data", + # "images_width", + # "images_height", + # ] - print("Length of the result:", len(result)) - print("Columns of result:", result.columns) - print("First rows of result:", result.head()) + # print("Length of the result:", len(result)) + # print("Columns of result:", result.columns) + # print("First rows of result:", result.head()) - return result + return dataframe if __name__ == "__main__": - executor = DaskTransformExecutor.from_args() + executor = PandasTransformExecutor.from_args() executor.execute(DownloadImagesComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 13bb38d5f..ca83d6efd 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 +image: ghcr.io/ml6team/load_from_hf_hub:184080f269fe2f17e01d418fda221733003ac2b5 produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 15475eef1..6f09d7f18 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -39,7 +39,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 10000, + "n_rows_to_load": 10, "dataset_length": 12800000, }, node_pool_name="n2-standard-128-pool", From f81f12256ce41ee69e7af37ac3673a26f4a1f8b9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 1 Aug 2023 17:26:59 +0200 Subject: [PATCH 38/65] Run on 1000 images --- .../load_from_hf_hub/fondant_component.yaml | 2 +- .../download_images/fondant_component.yaml | 6 +-- .../components/download_images/src/main.py | 43 +++++++------------ .../load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 3 +- 5 files changed, 23 insertions(+), 33 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 94fa473ea..072794d95 100644 --- 
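The Pandas variant above assigns the raw output of Series.apply, a Series of tuples, to three target columns at once, which pandas cannot broadcast (and one target is spelled ("images", "height") rather than ("image", "height")). A hedged sketch of the usual expansion idiom, on a toy flat-column frame instead of the component's ("image", "url") subcolumns:

    import pandas as pd

    df = pd.DataFrame({"url": ["http://a", "http://b"]})

    def fake_fetch(url):
        # Illustrative stand-in: returns (data, width, height).
        return b"\x00", 256, 256

    expanded = df["url"].apply(fake_fetch).apply(pd.Series)  # Series of tuples -> DataFrame
    expanded.columns = ["data", "width", "height"]
    df[["data", "width", "height"]] = expanded
    print(df)
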
a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:184080f269fe2f17e01d418fda221733003ac2b5 +image: ghcr.io/ml6team/load_from_hf_hub:de73f003806455e248125d3b1ce19c1a52aea8ea produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index a32a6dddf..9844b2200 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:a366ee0d5a3902b618971bb0ede6817bbb7d18f3 +image: ghcr.io/ml6team/download_images:de73f003806455e248125d3b1ce19c1a52aea8ea consumes: image: @@ -14,9 +14,9 @@ produces: data: type: binary width: - type: int16 + type: int64 height: - type: int16 + type: int64 additionalFields: false args: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 249c1f8fb..0950e82e3 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -10,10 +10,10 @@ import traceback import urllib -import pandas as pd +import dask.dataframe as dd -from fondant.component import PandasTransformComponent -from fondant.executor import PandasTransformExecutor +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor from resizer import Resizer logger = logging.getLogger(__name__) @@ -88,7 +88,7 @@ def download_image_with_retry( return None, None, None -class DownloadImagesComponent(PandasTransformComponent): +class DownloadImagesComponent(DaskTransformComponent): """Component that downloads images based on URLs.""" def __init__(self, @@ -125,37 +125,26 @@ def __init__(self, max_aspect_ratio=max_aspect_ratio, ) - def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: + def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: logger.info("Downloading images...") - - print("Timeout:", self.timeout) - print("Retries:", self.retries) - dataframe[[("image", "data"), ("image", "width"), ("images", "height")]] = dataframe["image"]["url"].apply( + result = dataframe.apply( lambda example: download_image_with_retry( - url=example, - timeout=self.timeout, - retries=self.retries, - resizer=self.resizer, + url=example.image_url, timeout=self.timeout, retries=self.retries, resizer=self.resizer, ), - # axis=1, - # result_type="expand", - # meta={0: bytes, 1: int, 2: int}, + axis=1, + result_type='expand', + meta={0: object, 1: int, 2: int}, ) + result.columns = ['image_data', 'image_width', 'image_height'] - # result.columns = [ - # "images_data", - # "images_width", - # "images_height", - # ] - - # print("Length of the result:", len(result)) - # print("Columns of result:", result.columns) - # print("First rows of result:", result.head()) + print("Length of the final dataframe:", len(dataframe)) + print("First few rows of final dataframe:") + print(result.head(5)) - return dataframe + return result if __name__ == "__main__": - executor = PandasTransformExecutor.from_args() + executor 
= DaskTransformExecutor.from_args() executor.execute(DownloadImagesComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index ca83d6efd..ecce89a91 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:184080f269fe2f17e01d418fda221733003ac2b5 +image: ghcr.io/ml6team/load_from_hf_hub:de73f003806455e248125d3b1ce19c1a52aea8ea produces: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 6f09d7f18..e70ce3b9c 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -39,7 +39,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 10, + "n_rows_to_load": 1000, "dataset_length": 12800000, }, node_pool_name="n2-standard-128-pool", @@ -70,6 +70,7 @@ download_images_op = ComponentOp( component_dir="components/download_images", node_pool_name="n2-standard-128-pool", + output_partition_size="disable", ) # add ops to pipeline From 0c711303bd00ed63712037bbae9dea1b89f6009c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Aug 2023 10:23:37 +0200 Subject: [PATCH 39/65] Use map_partitions --- .../download_images/fondant_component.yaml | 2 +- .../components/download_images/src/main.py | 48 +++++++++++++++---- examples/pipelines/datacomp/pipeline.py | 6 +-- 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 9844b2200..3e14e3958 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:de73f003806455e248125d3b1ce19c1a52aea8ea +image: ghcr.io/ml6team/download_images:aa4cf164c762cba00480c1335251d81a2c10fc44 consumes: image: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 0950e82e3..da554bc2c 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -11,6 +11,7 @@ import urllib import dask.dataframe as dd +import numpy as np from fondant.component import DaskTransformComponent from fondant.executor import DaskTransformExecutor @@ -88,6 +89,24 @@ def download_image_with_retry( return None, None, None +def download_image_with_retry_partition(df, timeout, retries, resizer): + # process a single partition + # TODO make column name more flexible + data = df.image_url.apply(lambda x: + download_image_with_retry( + url=x, timeout=timeout, retries=retries, resizer=resizer, + ), + ) + + # use assign to add values as extra columns + df = df.assign(data=[example[0] for example in data], + width=[example[1] for example in data], + height=[example[2] for example in data], + ) + + 
return df + + class DownloadImagesComponent(DaskTransformComponent): """Component that downloads images based on URLs.""" @@ -128,21 +147,30 @@ def __init__(self, def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: logger.info("Downloading images...") - result = dataframe.apply( - lambda example: download_image_with_retry( - url=example.image_url, timeout=self.timeout, retries=self.retries, resizer=self.resizer, - ), - axis=1, - result_type='expand', - meta={0: object, 1: int, 2: int}, + # create meta + # needs to be a dictionary with keys = column names, values = dtypes of columns + # for each column in the output + meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} + meta["data"] = np.dtype(bytes) + meta["width"] = np.dtype(int) + meta["height"] = np.dtype(int) + + dataframe = dataframe.map_partitions( + download_image_with_retry_partition, + timeout=self.timeout, + retries=self.retries, + resizer=self.resizer, + meta=meta, ) - result.columns = ['image_data', 'image_width', 'image_height'] + + # rename new columns to be conform the spec + dataframe = dataframe.rename(columns={"data": "image_data", "width": "image_width", "height":"image_height"}) print("Length of the final dataframe:", len(dataframe)) print("First few rows of final dataframe:") - print(result.head(5)) + print(dataframe.head(5)) - return result + return dataframe if __name__ == "__main__": diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index e70ce3b9c..c34e77949 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -39,7 +39,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 1000, + "n_rows_to_load": 50000, "dataset_length": 12800000, }, node_pool_name="n2-standard-128-pool", From dd9d06d649a0048bad718c235948317c241cdc9a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Aug 2023 10:56:11 +0200 Subject: [PATCH 40/65] Add logging --- .../pipelines/datacomp/components/download_images/src/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index da554bc2c..93cb50ed2 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -145,6 +145,8 @@ def __init__(self, ) def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + + logger.info("Length of the dataframe:", len(dataframe)) logger.info("Downloading images...") # create meta From a940050d378774ded4226a8e0838a175d6f65712 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Aug 2023 12:08:10 +0200 Subject: [PATCH 41/65] More improvements --- .../download_images/fondant_component.yaml | 17 +++++++++++++++-- examples/pipelines/datacomp/pipeline.py | 4 ++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git 
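Patch 39's shape, a partition-level helper plus a meta dict extended from the input dtypes, is straightforward to test standalone. A rough sketch under toy assumptions (fake fetcher, object dtype for the bytes column):

    import dask.dataframe as dd
    import numpy as np
    import pandas as pd

    def fetch_partition(part: pd.DataFrame) -> pd.DataFrame:
        results = part["image_url"].apply(lambda url: (b"\x00", 256, 256))
        return part.assign(
            data=[r[0] for r in results],
            width=[r[1] for r in results],
            height=[r[2] for r in results],
        )

    df = dd.from_pandas(pd.DataFrame({"image_url": ["http://a", "http://b"]}), npartitions=1)

    meta = {col: dtype for col, dtype in zip(df.columns, df.dtypes)}
    meta.update(data=np.dtype(object), width=np.dtype(int), height=np.dtype(int))

    out = df.map_partitions(fetch_partition, meta=meta)
    print(out.compute())
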
a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 3e14e3958..651d1230a 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:aa4cf164c762cba00480c1335251d81a2c10fc44 +image: ghcr.io/ml6team/download_images:50c28a05f04587c8fc445ab8199cbf16fb32dcac consumes: image: @@ -11,13 +11,26 @@ consumes: produces: image: fields: + url: + type: string data: type: binary width: type: int64 height: type: int64 - additionalFields: false + face_bboxes: + type: array + items: + type: array + items: + type: float32 + sha256: + type: utf8 + embedding: + type: array + items: + type: float32 args: timeout: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index c34e77949..dfb8f8342 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From 3ae578ad1bfa9278f4435d3007befc94c8292eb1 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Aug 2023 12:20:32 +0200 Subject: [PATCH 42/65] More improvements --- components/load_from_hf_hub/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 072794d95..b03bb1c7a 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:de73f003806455e248125d3b1ce19c1a52aea8ea +image: ghcr.io/ml6team/load_from_hf_hub:50c28a05f04587c8fc445ab8199cbf16fb32dcac produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index dfb8f8342..c34e77949 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From e490a2eb88c1799dcde17db7101f00ac32ddd1f3 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 2 Aug 2023 13:50:48 +0200 Subject: [PATCH 43/65] Fix rebase --- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git 
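The nested field types this patch adds to the component spec (array of array of float32, utf8) read like Arrow types. Assuming that mapping holds, and purely as an illustration rather than Fondant's actual schema machinery, the equivalent pyarrow declarations would look like:

    import pyarrow as pa

    schema = pa.schema([
        ("image_url", pa.string()),
        ("image_data", pa.binary()),
        ("image_width", pa.int64()),
        ("image_height", pa.int64()),
        ("image_face_bboxes", pa.list_(pa.list_(pa.float32()))),
        ("image_sha256", pa.string()),  # "utf8" in the spec
        ("image_embedding", pa.list_(pa.float32())),
    ])
    print(schema)
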
a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index 5db8abca7..d5a4a0f18 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=f3f3925b8e8f634e2978e5c7fcefa72c53baba7c +ARG FONDANT_VERSION=e268128ab04bb8cfa030928d43efb0a3b77caad5 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index b03bb1c7a..537c304db 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:50c28a05f04587c8fc445ab8199cbf16fb32dcac +image: ghcr.io/ml6team/load_from_hf_hub:e268128ab04bb8cfa030928d43efb0a3b77caad5 produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index ecce89a91..a74d0ecfd 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:de73f003806455e248125d3b1ce19c1a52aea8ea +image: ghcr.io/ml6team/load_from_hf_hub:e268128ab04bb8cfa030928d43efb0a3b77caad5 produces: image: From edc65f516067937401e325650d5b5409f071bc39 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 08:49:24 +0200 Subject: [PATCH 44/65] More improvements --- components/load_from_hf_hub/Dockerfile | 2 +- .../load_from_hf_hub/fondant_component.yaml | 2 +- .../download_images/fondant_component.yaml | 16 +--------------- .../components/download_images/src/main.py | 5 ++++- .../load_from_hf_hub/fondant_component.yaml | 2 +- 5 files changed, 8 insertions(+), 19 deletions(-) diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index d5a4a0f18..30d994d98 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=e268128ab04bb8cfa030928d43efb0a3b77caad5 +ARG FONDANT_VERSION=5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml index 537c304db..47fb2de4d 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:e268128ab04bb8cfa030928d43efb0a3b77caad5 +image: ghcr.io/ml6team/load_from_hf_hub:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 produces: dummy_variable: #TODO: fill in here diff --git 
a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 651d1230a..c83cf87d7 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:50c28a05f04587c8fc445ab8199cbf16fb32dcac +image: ghcr.io/ml6team/download_images:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 consumes: image: @@ -11,26 +11,12 @@ consumes: produces: image: fields: - url: - type: string data: type: binary width: type: int64 height: type: int64 - face_bboxes: - type: array - items: - type: array - items: - type: float32 - sha256: - type: utf8 - embedding: - type: array - items: - type: float32 args: timeout: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 93cb50ed2..377c9ccef 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -166,7 +166,10 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: ) # rename new columns to be conform the spec - dataframe = dataframe.rename(columns={"data": "image_data", "width": "image_width", "height":"image_height"}) + dataframe = dataframe.rename(columns={"data": "image_data", "width": "image_width", "height":"image_height"}) + + # Remove images that could not be fetched + dataframe = dataframe.dropna() print("Length of the final dataframe:", len(dataframe)) print("First few rows of final dataframe:") diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index a74d0ecfd..7a65e0bc0 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:e268128ab04bb8cfa030928d43efb0a3b77caad5 +image: ghcr.io/ml6team/load_from_hf_hub:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 produces: image: From 7acbeba1796d23b8e055c62a7ebc98c2d339163b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 08:52:57 +0200 Subject: [PATCH 45/65] Fix rebase --- components/load_from_hf_hub/Dockerfile | 2 +- components/load_from_hf_hub/fondant_component.yaml | 2 +- .../datacomp/components/load_from_hf_hub/fondant_component.yaml | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/components/load_from_hf_hub/Dockerfile b/components/load_from_hf_hub/Dockerfile index 30d994d98..7df92b7f6 100644 --- a/components/load_from_hf_hub/Dockerfile +++ b/components/load_from_hf_hub/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 +ARG FONDANT_VERSION=edc65f516067937401e325650d5b5409f071bc39 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/components/load_from_hf_hub/fondant_component.yaml 
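One caveat on the dropna() added in patch 44 above: with no arguments it drops a row if any column is missing, not only the fetched-image ones. A small sketch of restricting the drop with subset (illustrative column names, not a change the patch series itself makes):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "image_url": ["http://a", "http://b"],
        "image_data": [b"\x00", None],      # second download failed
        "image_width": [256.0, np.nan],
        "image_height": [256.0, np.nan],
    })

    df = df.dropna(subset=["image_data", "image_width", "image_height"])
    print(len(df))  # 1
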
b/components/load_from_hf_hub/fondant_component.yaml index 47fb2de4d..f4baa6378 100644 --- a/components/load_from_hf_hub/fondant_component.yaml +++ b/components/load_from_hf_hub/fondant_component.yaml @@ -1,6 +1,6 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 +image: ghcr.io/ml6team/load_from_hf_hub:edc65f516067937401e325650d5b5409f071bc39 produces: dummy_variable: #TODO: fill in here diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 7a65e0bc0..44d96a8ab 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -31,6 +31,8 @@ produces: image_text: fields: + uid: + type: string clip_b32_similarity_score: type: float32 clip_l14_similarity_score: From f225ef9e6b24c38795d841a98c9e0957c120eff8 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 09:55:40 +0200 Subject: [PATCH 46/65] Include uids --- .../components/download_images/fondant_component.yaml | 2 +- .../components/load_from_hf_hub/fondant_component.yaml | 6 +++--- examples/pipelines/datacomp/pipeline.py | 4 ++++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index c83cf87d7..5e846f2ae 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 +image: ghcr.io/ml6team/download_images:edc65f516067937401e325650d5b5409f071bc39 consumes: image: diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index 44d96a8ab..dd48dd285 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -1,10 +1,12 @@ name: Load from hub description: Component that loads a dataset from the hub -image: ghcr.io/ml6team/load_from_hf_hub:5a1bae24ad16d7a8cfd3aa09fbe15feb813c6d41 +image: ghcr.io/ml6team/load_from_hf_hub:edc65f516067937401e325650d5b5409f071bc39 produces: image: fields: + uid: + type: string url: type: string width: @@ -31,8 +33,6 @@ produces: image_text: fields: - uid: - type: string clip_b32_similarity_score: type: float32 clip_l14_similarity_score: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index c34e77949..7f7a27164 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -22,6 +22,7 @@ # define ops load_component_column_mapping = { + "uid": "image_uid", "url": "image_url", "original_width": "image_width", "original_height": "image_height", @@ -42,12 +43,14 @@ "n_rows_to_load": 50000, "dataset_length": 12800000, }, + node_pool_label="node_pool", node_pool_name="n2-standard-128-pool", # output_partition_size="10MB", ) filter_image_resolution_op = ComponentOp.from_registry( 
name="filter_image_resolution", arguments={"min_image_dim": 200, "max_aspect_ratio": 3}, + node_pool_label="node_pool", node_pool_name="n2-standard-128-pool", output_partition_size='disable', ) @@ -69,6 +72,7 @@ ) download_images_op = ComponentOp( component_dir="components/download_images", + node_pool_label="node_pool", node_pool_name="n2-standard-128-pool", output_partition_size="disable", ) From 1bfe2593b5058b663b608f21740425b482c30b5c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 10:38:10 +0200 Subject: [PATCH 47/65] More improvements --- .../components/detect_text/Dockerfile | 23 +++ .../detect_text/fondant_component.yaml | 17 +++ .../components/detect_text/requirements.txt | 6 + .../components/detect_text/src/main.py | 141 ++++++++++++++++++ .../load_from_hf_hub/fondant_component.yaml | 4 +- examples/pipelines/datacomp/pipeline.py | 8 +- 6 files changed, 193 insertions(+), 6 deletions(-) create mode 100644 examples/pipelines/datacomp/components/detect_text/Dockerfile create mode 100644 examples/pipelines/datacomp/components/detect_text/fondant_component.yaml create mode 100644 examples/pipelines/datacomp/components/detect_text/requirements.txt create mode 100644 examples/pipelines/datacomp/components/detect_text/src/main.py diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile new file mode 100644 index 000000000..787f89bb4 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile @@ -0,0 +1,23 @@ +FROM --platform=linux/amd64 python:3.8-slim + +# System dependencies +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install git -y + +# Install requirements +COPY requirements.txt / +RUN pip3 install --no-cache-dir -r requirements.txt + +# Install Fondant +# This is split from other requirements to leverage caching +ARG FONDANT_VERSION=7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 +RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} + +# Set the working directory to the component folder +WORKDIR /component/src + +# Copy over src-files +COPY src/ . 
+ +ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml new file mode 100644 index 000000000..2f1026e53 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -0,0 +1,17 @@ +name: Detect text +description: Component that detects text in images +image: ghcr.io/ml6team/detext_text:edc65f516067937401e325650d5b5409f071bc39 + +consumes: + image: + fields: + data: + type: bytes + +produces: + image: + fields: + data: + type: binary + detected_boxes: + type: int64 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text/requirements.txt b/examples/pipelines/datacomp/components/detect_text/requirements.txt new file mode 100644 index 000000000..10913ea37 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text/requirements.txt @@ -0,0 +1,6 @@ +huggingface-hub==0.16.4 +easyocr==1.7.0 +onnxruntime==1.15.1 +onnxruntime-gpu==1.15.1 +Pillow==10.0.0 +torch==2.0.1 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py new file mode 100644 index 000000000..c10a193e0 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -0,0 +1,141 @@ +"""This component detexts text in images, using CRAFT. +""" +import logging + +import dask.dataframe as dd +import numpy as np +import io +from PIL import Image +import pandas as pd + +from huggingface_hub import hf_hub_download + +from easyocr.craft_utils import getDetBoxes, adjustResultCoordinates +from easyocr.imgproc import normalizeMeanVariance +from easyocr.utils import group_text_box + +import torch +import onnxruntime as ort + +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor + +logger = logging.getLogger(__name__) + + +def resize_aspect_ratio_pillow(img, square_size, mag_ratio=1): + height, width, channel = img.shape + + # magnify image size + target_size = mag_ratio * max(height, width) + + # set original image size + if target_size > square_size: + target_size = square_size + + ratio = target_size / max(height, width) + + target_h, target_w = int(height * ratio), int(width * ratio) + img = Image.fromarray(img) + proc = img.resize((target_w, target_h), resample = Image.BILINEAR) + + # make canvas and paste image + target_h32, target_w32 = target_h, target_w + if target_h % 32 != 0: + target_h32 = target_h + (32 - target_h % 32) + if target_w % 32 != 0: + target_w32 = target_w + (32 - target_w % 32) + resized = np.zeros((target_h32, target_w32, channel), dtype=np.float32) + resized[0:target_h, 0:target_w, :] = proc + target_h, target_w = target_h32, target_w32 + + size_heatmap = (int(target_w/2), int(target_h/2)) + + return resized, ratio, size_heatmap + + +def get_boxes(image_data, session): + try: + image = Image.open(io.BytesIO(image_data)).convert("RGB") + image = np.array(image) + except: + return [None] + + # Use Pillow instead of cv2 + img_resized, target_ratio, size_heatmap = resize_aspect_ratio_pillow(img=image, + square_size=512, + mag_ratio=1.0) + + ratio_h = ratio_w = 1 / target_ratio + x = normalizeMeanVariance(img_resized) + x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0) + + input_name = session.get_inputs()[0].name + + # Prepare input tensor for inference + inp = {input_name: x.numpy()} + + 
# Run inference and get output + y, _ = session.run(None, inp) + + # Extract score and link maps + score_text = y[0, :, :, 0] + score_link = y[0, :, :, 1] + + # Post-processing to obtain bounding boxes and polygons + boxes, _, _ = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4) + boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h) + + # Create horizontal reading list + polys = [] + for box in boxes: + poly = np.array(box).astype(np.int32).reshape((-1)) + polys.append(poly) + + horizontal_list, _ = group_text_box(polys) + + return horizontal_list + + +def get_boxes_dataframe(df, session): + # process a single partition + # TODO make column name more flexible + df["image_detected_boxes"] = df.image_data.apply(lambda x: + get_boxes( + image_data=x, session=session, + ), + ) + + return df + + +class DetextTextComponent(DaskTransformComponent): + """Component that detexts text in images, using the CRAFT model. + """ + + def __init__(self, *args) -> None: + + craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") + providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] + self.session = ort.InferenceSession(craft_onnx, providers=providers) + + def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + + # create meta + # needs to be a dictionary with keys = column names, values = dtypes of columns + # for each column in the output + meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} + meta["image_detected_boxes"] = np.dtype(object) + + dataframe = dataframe.map_partitions( + get_boxes_dataframe, + session=self.session, + meta=meta, + ) + + return dataframe + + +if __name__ == "__main__": + executor = DaskTransformExecutor.from_args() + executor.execute(DetextTextComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml index dd48dd285..01e5994ff 100644 --- a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml @@ -5,8 +5,6 @@ image: ghcr.io/ml6team/load_from_hf_hub:edc65f516067937401e325650d5b5409f071bc39 produces: image: fields: - uid: - type: string url: type: string width: @@ -33,6 +31,8 @@ produces: image_text: fields: + uid: + type: string clip_b32_similarity_score: type: float32 clip_l14_similarity_score: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 7f7a27164..db4fcccbe 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,14 +15,13 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) # define ops load_component_column_mapping = { - "uid": "image_uid", "url": "image_url", "original_width": "image_width", "original_height": "image_height", @@ -30,6 +29,7 @@ "sha256": "image_sha256", "clip_l14_embedding": "image_embedding", "text": "text_data", + "uid": "image_text_uid", 
"clip_b32_similarity_score": "image_text_clip_b32_similarity_score", "clip_l14_similarity_score": "image_text_clip_l14_similarity_score", } @@ -40,7 +40,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 50000, + "n_rows_to_load": 10, "dataset_length": 12800000, }, node_pool_label="node_pool", From 025399a9663800c25214ee718ef4b4661a5e00df Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 11:31:41 +0200 Subject: [PATCH 48/65] More improvements --- .../datacomp/components/detect_text/Dockerfile | 2 +- .../components/detect_text/fondant_component.yaml | 10 +++++++--- .../datacomp/components/detect_text/src/main.py | 3 ++- examples/pipelines/datacomp/pipeline.py | 12 ++++++++++-- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile index 787f89bb4..1430c6972 100644 --- a/examples/pipelines/datacomp/components/detect_text/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=7db2865958ff18a3aec6aafbb3374c2a70a6b8f5 +ARG FONDANT_VERSION=1bfe2593b5058b663b608f21740425b482c30b5c RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 2f1026e53..49d6688c5 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,12 +1,12 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detext_text:edc65f516067937401e325650d5b5409f071bc39 +image: ghcr.io/ml6team/detect_text:1bfe2593b5058b663b608f21740425b482c30b5c consumes: image: fields: data: - type: bytes + type: binary produces: image: @@ -14,4 +14,8 @@ produces: data: type: binary detected_boxes: - type: int64 \ No newline at end of file + type: array + items: + type: array + items: + type: int64 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index c10a193e0..ca4f23fc1 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -6,7 +6,6 @@ import numpy as np import io from PIL import Image -import pandas as pd from huggingface_hub import hf_hub_download @@ -116,6 +115,7 @@ class DetextTextComponent(DaskTransformComponent): def __init__(self, *args) -> None: craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") + logger.info("Device:" ort.get_device()) providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] self.session = ort.InferenceSession(craft_onnx, providers=providers) @@ -127,6 +127,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} 
meta["image_detected_boxes"] = np.dtype(object) + logger.info("Detecting texts..") dataframe = dataframe.map_partitions( get_boxes_dataframe, session=self.session, diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index db4fcccbe..9beafe25a 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -76,12 +76,20 @@ node_pool_name="n2-standard-128-pool", output_partition_size="disable", ) +detect_text_op = ComponentOp( + component_dir="components/detect_text", + node_pool_label="node_pool", + node_pool_name="n2-standard-128-pool", + output_partition_size="disable", +) + # add ops to pipeline pipeline.add_op(load_from_hub_op) # pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) pipeline.add_op(download_images_op, dependencies=load_from_hub_op) +pipeline.add_op(detect_text_op, dependencies=download_images_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From 0c9ac7f63812be43c3cffc30396ef4a71cd0578c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 11:58:09 +0200 Subject: [PATCH 49/65] More improvements --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- .../pipelines/datacomp/components/detect_text/src/main.py | 2 +- examples/pipelines/datacomp/pipeline.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 49d6688c5..9f5931812 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:1bfe2593b5058b663b608f21740425b482c30b5c +image: ghcr.io/ml6team/detect_text:025399a9663800c25214ee718ef4b4661a5e00df consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index ca4f23fc1..1acc0d162 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -115,7 +115,7 @@ class DetextTextComponent(DaskTransformComponent): def __init__(self, *args) -> None: craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") - logger.info("Device:" ort.get_device()) + logger.info("Device:", ort.get_device()) providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] self.session = ort.InferenceSession(craft_onnx, providers=providers) diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 9beafe25a..0384cde31 100644 --- 
a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -78,8 +78,9 @@ ) detect_text_op = ComponentOp( component_dir="components/detect_text", - node_pool_label="node_pool", - node_pool_name="n2-standard-128-pool", + number_of_gpus=1, + node_pool_label="node_pool", + node_pool_name="model-inference-pool", output_partition_size="disable", ) From c179edf9bc4b95c0624750a0f51d5c82d5ee68a2 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 13:30:11 +0200 Subject: [PATCH 50/65] More improvements --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- .../pipelines/datacomp/components/detect_text/src/main.py | 2 ++ .../datacomp/components/download_images/src/main.py | 6 ++---- examples/pipelines/datacomp/pipeline.py | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 9f5931812..befdf1dc3 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:025399a9663800c25214ee718ef4b4661a5e00df +image: ghcr.io/ml6team/detect_text:0c9ac7f63812be43c3cffc30396ef4a71cd0578c consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index 1acc0d162..15a2dca87 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -121,6 +121,8 @@ def __init__(self, *args) -> None: def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + logger.info(f"Length of the dataframe: {len(dataframe)}") + # create meta # needs to be a dictionary with keys = column names, values = dtypes of columns # for each column in the output diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index 377c9ccef..10932daa5 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -146,7 +146,7 @@ def __init__(self, def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: - logger.info("Length of the dataframe:", len(dataframe)) + logger.info(f"Length of the dataframe: {len(dataframe)}") logger.info("Downloading images...") # create meta @@ -171,9 +171,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: # Remove images that could not be fetched dataframe = dataframe.dropna() - print("Length of the final dataframe:", len(dataframe)) - print("First few rows of final dataframe:") - print(dataframe.head(5)) + logger.info(f"Length of the final dataframe: {len(dataframe)}") return dataframe diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 0384cde31..1eb5c8a3d 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # 
base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -90,7 +90,7 @@ # pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) pipeline.add_op(download_images_op, dependencies=load_from_hub_op) -pipeline.add_op(detect_text_op, dependencies=download_images_op) +# pipeline.add_op(detect_text_op, dependencies=download_images_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From 78660256b14287b586b374afcd129ea23f538cc6 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 14:45:02 +0200 Subject: [PATCH 51/65] More improvements --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- .../datacomp/components/detect_text/requirements.txt | 1 - .../pipelines/datacomp/components/detect_text/src/main.py | 5 ++++- .../components/download_images/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 4 ++-- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index befdf1dc3..f647bf08f 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:0c9ac7f63812be43c3cffc30396ef4a71cd0578c +image: ghcr.io/ml6team/detect_text:c179edf9bc4b95c0624750a0f51d5c82d5ee68a2 consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text/requirements.txt b/examples/pipelines/datacomp/components/detect_text/requirements.txt index 10913ea37..16a68c71b 100644 --- a/examples/pipelines/datacomp/components/detect_text/requirements.txt +++ b/examples/pipelines/datacomp/components/detect_text/requirements.txt @@ -1,6 +1,5 @@ huggingface-hub==0.16.4 easyocr==1.7.0 -onnxruntime==1.15.1 onnxruntime-gpu==1.15.1 Pillow==10.0.0 torch==2.0.1 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index 15a2dca87..72c6d771c 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -115,7 +115,7 @@ class DetextTextComponent(DaskTransformComponent): def __init__(self, *args) -> None: craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") - logger.info("Device:", ort.get_device()) + logger.info(f"Device: {ort.get_device()}") providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] self.session = ort.InferenceSession(craft_onnx, providers=providers) @@ -136,6 +136,9 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: meta=meta, ) + logger.info(f"Length of the final dataframe: {len(dataframe)}") + print("First rows of final dataframe:", dataframe.head()) + return dataframe diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 5e846f2ae..b756099c8 100644 --- 
a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:edc65f516067937401e325650d5b5409f071bc39 +image: ghcr.io/ml6team/download_images:c179edf9bc4b95c0624750a0f51d5c82d5ee68a2 consumes: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 1eb5c8a3d..666656403 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) From 22de5f60ae89d6f6efc56a6d1002e147cab75fd2 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 15:13:31 +0200 Subject: [PATCH 52/65] Use cpu for now --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- .../pipelines/datacomp/components/detect_text/src/main.py | 1 + examples/pipelines/datacomp/pipeline.py | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index f647bf08f..db667c29f 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:c179edf9bc4b95c0624750a0f51d5c82d5ee68a2 +image: ghcr.io/ml6team/detect_text:78660256b14287b586b374afcd129ea23f538cc6 consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index 72c6d771c..f4a27c562 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -117,6 +117,7 @@ def __init__(self, *args) -> None: craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") logger.info(f"Device: {ort.get_device()}") providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] + providers = ['CPUExecutionProvider'] self.session = ort.InferenceSession(craft_onnx, providers=providers) def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 666656403..138ff986f 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - base_path=PipelineConfigs.BASE_PATH, - # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + # base_path=PipelineConfigs.BASE_PATH, + base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = 
Client(host=PipelineConfigs.HOST) @@ -90,7 +90,7 @@ # pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) pipeline.add_op(download_images_op, dependencies=load_from_hub_op) -# pipeline.add_op(detect_text_op, dependencies=download_images_op) +pipeline.add_op(detect_text_op, dependencies=download_images_op) # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) # TODO add more ops From dcc714e12374928e893b2fd6865b151923f0a791 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 16:39:03 +0200 Subject: [PATCH 53/65] More improvements --- .../datacomp/components/detect_text/fondant_component.yaml | 4 ++-- .../pipelines/datacomp/components/detect_text/src/main.py | 5 +++-- .../components/download_images/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 6 +++--- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index db667c29f..46e9f8252 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:78660256b14287b586b374afcd129ea23f538cc6 +image: ghcr.io/ml6team/detect_text:22de5f60ae89d6f6efc56a6d1002e147cab75fd2 consumes: image: @@ -13,7 +13,7 @@ produces: fields: data: type: binary - detected_boxes: + boxes: type: array items: type: array diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index f4a27c562..8d0d8fe0b 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -99,7 +99,7 @@ def get_boxes(image_data, session): def get_boxes_dataframe(df, session): # process a single partition # TODO make column name more flexible - df["image_detected_boxes"] = df.image_data.apply(lambda x: + df["image_boxes"] = df.image_data.apply(lambda x: get_boxes( image_data=x, session=session, ), @@ -128,7 +128,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: # needs to be a dictionary with keys = column names, values = dtypes of columns # for each column in the output meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} - meta["image_detected_boxes"] = np.dtype(object) + meta["image_boxes"] = np.dtype(object) logger.info("Detecting texts..") dataframe = dataframe.map_partitions( @@ -138,6 +138,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: ) logger.info(f"Length of the final dataframe: {len(dataframe)}") + print("Columns of the final dataframe", dataframe.columns) print("First rows of final dataframe:", dataframe.head()) return dataframe diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index b756099c8..620984e9e 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: 
ghcr.io/ml6team/download_images:c179edf9bc4b95c0624750a0f51d5c82d5ee68a2 +image: ghcr.io/ml6team/download_images:22de5f60ae89d6f6efc56a6d1002e147cab75fd2 consumes: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 138ff986f..dfe6f9f4f 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -40,7 +40,7 @@ "dataset_name": "nielsr/datacomp-small-with-embeddings", "image_column_names": [], "column_name_mapping": load_component_column_mapping, - "n_rows_to_load": 10, + "n_rows_to_load": 1000, "dataset_length": 12800000, }, node_pool_label="node_pool", From fa341a58a463c825f90198a42adbe40ce1cc600a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 16:58:59 +0200 Subject: [PATCH 54/65] Run text detection on 1000 images --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 46e9f8252..54aaffd6b 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:22de5f60ae89d6f6efc56a6d1002e147cab75fd2 +image: ghcr.io/ml6team/detect_text:dcc714e12374928e893b2fd6865b151923f0a791 consumes: image: From dada0106b4a39ad53025704541231d350ee262a2 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 17:49:26 +0200 Subject: [PATCH 55/65] Remove print statement --- examples/pipelines/datacomp/components/detect_text/Dockerfile | 4 ++-- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- .../pipelines/datacomp/components/detect_text/src/main.py | 1 - src/fondant/data_io.py | 3 --- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile index 1430c6972..d6059a33a 100644 --- a/examples/pipelines/datacomp/components/detect_text/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 python:3.8-slim +FROM --platform=linux/amd64 pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime # System dependencies RUN apt-get update && \ @@ -6,7 +6,7 @@ RUN apt-get update && \ apt-get install git -y # Install requirements -COPY requirements.txt / +COPY requirements.txt ./ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 54aaffd6b..520bda66a 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: 
Component that detects text in images -image: ghcr.io/ml6team/detect_text:dcc714e12374928e893b2fd6865b151923f0a791 +image: ghcr.io/ml6team/detect_text:fa341a58a463c825f90198a42adbe40ce1cc600a consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py index 8d0d8fe0b..e4e0a4d48 100644 --- a/examples/pipelines/datacomp/components/detect_text/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text/src/main.py @@ -117,7 +117,6 @@ def __init__(self, *args) -> None: craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") logger.info(f"Device: {ort.get_device()}") providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] - providers = ['CPUExecutionProvider'] self.session = ort.InferenceSession(craft_onnx, providers=providers) def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 1f109a96c..4fa442747 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -93,9 +93,6 @@ def _load_subset(self, subset_name: str, fields: t.List[str]) -> dd.DataFrame: subset_df = dd.read_parquet(remote_path, columns=fields) - logger.info(f"First few rows of subset {subset_name}:") - print(subset_df.head()) - # add subset prefix to columns subset_df = subset_df.rename( columns={col: subset_name + "_" + col for col in subset_df.columns}, From 7ec067b0d610d9658ba2dffe0ac33475926840bb Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 3 Aug 2023 20:02:41 +0200 Subject: [PATCH 56/65] More improvements --- examples/pipelines/datacomp/components/detect_text/Dockerfile | 2 +- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile index d6059a33a..e37002456 100644 --- a/examples/pipelines/datacomp/components/detect_text/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=1bfe2593b5058b663b608f21740425b482c30b5c +ARG FONDANT_VERSION=dada0106b4a39ad53025704541231d350ee262a2 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 520bda66a..1caae76a2 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:fa341a58a463c825f90198a42adbe40ce1cc600a +image: ghcr.io/ml6team/detect_text:dada0106b4a39ad53025704541231d350ee262a2 consumes: image: From ac2a130dd070d538d6f9a16855494261a012dded Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 4 Aug 2023 10:24:55 +0200 Subject: [PATCH 57/65] More improvements --- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- 
 .../pipelines/datacomp/components/detect_text/src/main.py  | 6 ------
 .../datacomp/components/download_images/Dockerfile         | 2 +-
 .../components/download_images/fondant_component.yaml      | 2 +-
 scripts/build_components.sh                                | 1 +
 5 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
index 1caae76a2..aafc45523 100644
--- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Detect text
 description: Component that detects text in images
-image: ghcr.io/ml6team/detect_text:dada0106b4a39ad53025704541231d350ee262a2
+image: ghcr.io/ml6team/detect_text:7ec067b0d610d9658ba2dffe0ac33475926840bb

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py
index e4e0a4d48..ecb7e7655 100644
--- a/examples/pipelines/datacomp/components/detect_text/src/main.py
+++ b/examples/pipelines/datacomp/components/detect_text/src/main.py
@@ -121,8 +121,6 @@ def __init__(self, *args) -> None:

     def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:

-        logger.info(f"Length of the dataframe: {len(dataframe)}")
-
         # create meta
         # needs to be a dictionary with keys = column names, values = dtypes of columns
         # for each column in the output
@@ -136,10 +134,6 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:
             meta=meta,
         )

-        logger.info(f"Length of the final dataframe: {len(dataframe)}")
-        print("Columns of the final dataframe", dataframe.columns)
-        print("First rows of final dataframe:", dataframe.head())
-
         return dataframe

diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile
index 787f89bb4..c5c2a7767 100644
--- a/examples/pipelines/datacomp/components/download_images/Dockerfile
+++ b/examples/pipelines/datacomp/components/download_images/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=7db2865958ff18a3aec6aafbb3374c2a70a6b8f5
+ARG FONDANT_VERSION=7ec067b0d610d9658ba2dffe0ac33475926840bb
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
index 620984e9e..e988ebe46 100644
--- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Download images
 description: Component that downloads images based on URLs
-image: ghcr.io/ml6team/download_images:22de5f60ae89d6f6efc56a6d1002e147cab75fd2
+image: ghcr.io/ml6team/download_images:7ec067b0d610d9658ba2dffe0ac33475926840bb

 consumes:
   image:
diff --git a/scripts/build_components.sh b/scripts/build_components.sh
index 265d08b83..178acba99 100755
--- a/scripts/build_components.sh
+++ b/scripts/build_components.sh
@@ -97,6 +97,7 @@ for dir in "${components_to_build[@]}"; do
     docker build --push "${args[@]}" \
       --build-arg="FONDANT_VERSION=${tags[0]}" \
      --label org.opencontainers.image.source=https://github.com/${namespace}/${repo} \
+      --platform=linux/amd64 \
       .

   popd

From 3d433d9e8dfeba967236445657dd8f415726de9a Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Fri, 4 Aug 2023 10:40:55 +0200
Subject: [PATCH 58/65] More improvements

---
 examples/pipelines/datacomp/components/detect_text/Dockerfile  | 2 +-
 .../datacomp/components/detect_text/fondant_component.yaml     | 2 +-
 examples/pipelines/datacomp/components/detect_text/src/main.py | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile
index e37002456..b52d9a008 100644
--- a/examples/pipelines/datacomp/components/detect_text/Dockerfile
+++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile
@@ -1,4 +1,4 @@
-FROM --platform=linux/amd64 pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
+FROM --platform=linux/amd64 python:3.8-slim

 # System dependencies
 RUN apt-get update && \
diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
index aafc45523..bf82cf094 100644
--- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Detect text
 description: Component that detects text in images
-image: ghcr.io/ml6team/detect_text:7ec067b0d610d9658ba2dffe0ac33475926840bb
+image: ghcr.io/ml6team/detect_text:ac2a130dd070d538d6f9a16855494261a012dded

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py
index ecb7e7655..bfcf79529 100644
--- a/examples/pipelines/datacomp/components/detect_text/src/main.py
+++ b/examples/pipelines/datacomp/components/detect_text/src/main.py
@@ -117,6 +117,7 @@ def __init__(self, *args) -> None:
         craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model")
         logger.info(f"Device: {ort.get_device()}")
         providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider']
+        providers = ['CPUExecutionProvider']
         self.session = ort.InferenceSession(craft_onnx, providers=providers)

     def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:

From cc418db0964ee202ebf2d4fd3aaee15604604244 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Fri, 4 Aug 2023 14:59:11 +0200
Subject: [PATCH 59/65] Simplify requirements

---
 .../detect_text/fondant_component.yaml             |   2 +-
 .../components/detect_text/requirements.txt        |   8 +-
 .../detect_text/src/easyocr_utils.py               | 366 ++++++++++++++++++
 .../components/detect_text/src/main.py             |   6 +-
 .../components/download_images/Dockerfile          |   2 +-
 .../download_images/fondant_component.yaml         |   2 +-
 examples/pipelines/datacomp/pipeline.py            |   6 +-
 7 files changed, 378 insertions(+), 14 deletions(-)
 create mode 100644 examples/pipelines/datacomp/components/detect_text/src/easyocr_utils.py

diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
index bf82cf094..7a6579aa0 100644
--- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Detect text
 description: Component that detects text in images
-image: ghcr.io/ml6team/detect_text:ac2a130dd070d538d6f9a16855494261a012dded
+image: ghcr.io/ml6team/detect_text:3d433d9e8dfeba967236445657dd8f415726de9a

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/detect_text/requirements.txt b/examples/pipelines/datacomp/components/detect_text/requirements.txt
index 16a68c71b..62fc9dd06 100644
--- a/examples/pipelines/datacomp/components/detect_text/requirements.txt
+++ b/examples/pipelines/datacomp/components/detect_text/requirements.txt
@@ -1,5 +1,5 @@
 huggingface-hub==0.16.4
-easyocr==1.7.0
-onnxruntime-gpu==1.15.1
-Pillow==10.0.0
-torch==2.0.1
\ No newline at end of file
+onnxruntime==1.15.1
+torch==2.0.1
+opencv-python-headless
+scipy
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text/src/easyocr_utils.py b/examples/pipelines/datacomp/components/detect_text/src/easyocr_utils.py
new file mode 100644
index 000000000..4feb67768
--- /dev/null
+++ b/examples/pipelines/datacomp/components/detect_text/src/easyocr_utils.py
@@ -0,0 +1,366 @@
+"""
+Copyright (c) 2019-present NAVER Corp.
+MIT License
+"""
+import numpy as np
+import cv2
+import math
+from scipy.ndimage import label
+
+""" auxiliary functions """
+# unwarp coordinates
+def warpCoord(Minv, pt):
+    out = np.matmul(Minv, (pt[0], pt[1], 1))
+    return np.array([out[0]/out[2], out[1]/out[2]])
+""" end of auxiliary functions """
+
+
+def getDetBoxes_core(textmap, linkmap, text_threshold, link_threshold, low_text, estimate_num_chars=False):
+    # prepare data
+    linkmap = linkmap.copy()
+    textmap = textmap.copy()
+    img_h, img_w = textmap.shape
+
+    """ labeling method """
+    ret, text_score = cv2.threshold(textmap, low_text, 1, 0)
+    ret, link_score = cv2.threshold(linkmap, link_threshold, 1, 0)
+
+    text_score_comb = np.clip(text_score + link_score, 0, 1)
+    nLabels, labels, stats, centroids = cv2.connectedComponentsWithStats(text_score_comb.astype(np.uint8), connectivity=4)
+
+    det = []
+    mapper = []
+    for k in range(1,nLabels):
+        # size filtering
+        size = stats[k, cv2.CC_STAT_AREA]
+        if size < 10: continue
+
+        # thresholding
+        if np.max(textmap[labels==k]) < text_threshold: continue
+
+        # make segmentation map
+        segmap = np.zeros(textmap.shape, dtype=np.uint8)
+        segmap[labels==k] = 255
+        if estimate_num_chars:
+            _, character_locs = cv2.threshold((textmap - linkmap) * segmap /255., text_threshold, 1, 0)
+            _, n_chars = label(character_locs)
+            mapper.append(n_chars)
+        else:
+            mapper.append(k)
+        segmap[np.logical_and(link_score==1, text_score==0)] = 0   # remove link area
+        x, y = stats[k, cv2.CC_STAT_LEFT], stats[k, cv2.CC_STAT_TOP]
+        w, h = stats[k, cv2.CC_STAT_WIDTH], stats[k, cv2.CC_STAT_HEIGHT]
+        niter = int(math.sqrt(size * min(w, h) / (w * h)) * 2)
+        sx, ex, sy, ey = x - niter, x + w + niter + 1, y - niter, y + h + niter + 1
+        # boundary check
+        if sx < 0 : sx = 0
+        if sy < 0 : sy = 0
+        if ex >= img_w: ex = img_w
+        if ey >= img_h: ey = img_h
+        kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(1 + niter, 1 + niter))
+        segmap[sy:ey, sx:ex] = cv2.dilate(segmap[sy:ey, sx:ex], kernel)
+
+        # make box
+        np_contours = np.roll(np.array(np.where(segmap!=0)),1,axis=0).transpose().reshape(-1,2)
+        rectangle = cv2.minAreaRect(np_contours)
+        box = cv2.boxPoints(rectangle)
+
+        # align diamond-shape
+        w, h = np.linalg.norm(box[0] - box[1]), np.linalg.norm(box[1] - box[2])
+        box_ratio = max(w, h) / (min(w, h) + 1e-5)
+        if abs(1 - box_ratio) <= 0.1:
+            l, r = min(np_contours[:,0]), max(np_contours[:,0])
+            t, b = min(np_contours[:,1]), max(np_contours[:,1])
+            box = np.array([[l, t], [r, t], [r, b], [l, b]], dtype=np.float32)
+
+        # make clock-wise order
+        startidx = box.sum(axis=1).argmin()
+        box = np.roll(box, 4-startidx, 0)
+        box = np.array(box)
+
+        det.append(box)
+
+    return det, labels, mapper
+
+def getPoly_core(boxes, labels, mapper, linkmap):
+    # configs
+    num_cp = 5
+    max_len_ratio = 0.7
+    expand_ratio = 1.45
+    max_r = 2.0
+    step_r = 0.2
+
+    polys = []
+    for k, box in enumerate(boxes):
+        # size filter for small instance
+        w, h = int(np.linalg.norm(box[0] - box[1]) + 1), int(np.linalg.norm(box[1] - box[2]) + 1)
+        if w < 10 or h < 10:
+            polys.append(None); continue
+
+        # warp image
+        tar = np.float32([[0,0],[w,0],[w,h],[0,h]])
+        M = cv2.getPerspectiveTransform(box, tar)
+        word_label = cv2.warpPerspective(labels, M, (w, h), flags=cv2.INTER_NEAREST)
+        try:
+            Minv = np.linalg.inv(M)
+        except:
+            polys.append(None); continue
+
+        # binarization for selected label
+        cur_label = mapper[k]
+        word_label[word_label != cur_label] = 0
+        word_label[word_label > 0] = 1
+
+        """ Polygon generation """
+        # find top/bottom contours
+        cp = []
+        max_len = -1
+        for i in range(w):
+            region = np.where(word_label[:,i] != 0)[0]
+            if len(region) < 2 : continue
+            cp.append((i, region[0], region[-1]))
+            length = region[-1] - region[0] + 1
+            if length > max_len: max_len = length
+
+        # pass if max_len is similar to h
+        if h * max_len_ratio < max_len:
+            polys.append(None); continue
+
+        # get pivot points with fixed length
+        tot_seg = num_cp * 2 + 1
+        seg_w = w / tot_seg     # segment width
+        pp = [None] * num_cp    # init pivot points
+        cp_section = [[0, 0]] * tot_seg
+        seg_height = [0] * num_cp
+        seg_num = 0
+        num_sec = 0
+        prev_h = -1
+        for i in range(0,len(cp)):
+            (x, sy, ey) = cp[i]
+            if (seg_num + 1) * seg_w <= x and seg_num <= tot_seg:
+                # average previous segment
+                if num_sec == 0: break
+                cp_section[seg_num] = [cp_section[seg_num][0] / num_sec, cp_section[seg_num][1] / num_sec]
+                num_sec = 0
+
+                # reset variables
+                seg_num += 1
+                prev_h = -1
+
+            # accumulate center points
+            cy = (sy + ey) * 0.5
+            cur_h = ey - sy + 1
+            cp_section[seg_num] = [cp_section[seg_num][0] + x, cp_section[seg_num][1] + cy]
+            num_sec += 1
+
+            if seg_num % 2 == 0: continue # No polygon area
+
+            if prev_h < cur_h:
+                pp[int((seg_num - 1)/2)] = (x, cy)
+                seg_height[int((seg_num - 1)/2)] = cur_h
+                prev_h = cur_h
+
+        # processing last segment
+        if num_sec != 0:
+            cp_section[-1] = [cp_section[-1][0] / num_sec, cp_section[-1][1] / num_sec]
+
+        # pass if num of pivots is not sufficient or segment width is smaller than character height
+        if None in pp or seg_w < np.max(seg_height) * 0.25:
+            polys.append(None); continue
+
+        # calc median maximum of pivot points
+        half_char_h = np.median(seg_height) * expand_ratio / 2
+
+        # calc gradient and apply to make horizontal pivots
+        new_pp = []
+        for i, (x, cy) in enumerate(pp):
+            dx = cp_section[i * 2 + 2][0] - cp_section[i * 2][0]
+            dy = cp_section[i * 2 + 2][1] - cp_section[i * 2][1]
+            if dx == 0:     # gradient if zero
+                new_pp.append([x, cy - half_char_h, x, cy + half_char_h])
+                continue
+            rad = - math.atan2(dy, dx)
+            c, s = half_char_h * math.cos(rad), half_char_h * math.sin(rad)
+            new_pp.append([x - s, cy - c, x + s, cy + c])
+
+        # get edge points to cover character heatmaps
+        isSppFound, isEppFound = False, False
+        grad_s = (pp[1][1] - pp[0][1]) / (pp[1][0] - pp[0][0]) + (pp[2][1] - pp[1][1]) / (pp[2][0] - pp[1][0])
+        grad_e = (pp[-2][1] - pp[-1][1]) / (pp[-2][0] - pp[-1][0]) + (pp[-3][1] - pp[-2][1]) / (pp[-3][0] - pp[-2][0])
+        for r in np.arange(0.5, max_r, step_r):
+            dx = 2 * half_char_h * r
+            if not isSppFound:
+                line_img = np.zeros(word_label.shape, dtype=np.uint8)
+                dy = grad_s * dx
+                p = np.array(new_pp[0]) - np.array([dx, dy, dx, dy])
+                cv2.line(line_img, (int(p[0]), int(p[1])), (int(p[2]), int(p[3])), 1, thickness=1)
+                if np.sum(np.logical_and(word_label, line_img)) == 0 or r + 2 * step_r >= max_r:
+                    spp = p
+                    isSppFound = True
+            if not isEppFound:
+                line_img = np.zeros(word_label.shape, dtype=np.uint8)
+                dy = grad_e * dx
+                p = np.array(new_pp[-1]) + np.array([dx, dy, dx, dy])
+                cv2.line(line_img, (int(p[0]), int(p[1])), (int(p[2]), int(p[3])), 1, thickness=1)
+                if np.sum(np.logical_and(word_label, line_img)) == 0 or r + 2 * step_r >= max_r:
+                    epp = p
+                    isEppFound = True
+            if isSppFound and isEppFound:
+                break
+
+        # pass if boundary of polygon is not found
+        if not (isSppFound and isEppFound):
+            polys.append(None); continue
+
+        # make final polygon
+        poly = []
+        poly.append(warpCoord(Minv, (spp[0], spp[1])))
+        for p in new_pp:
+            poly.append(warpCoord(Minv, (p[0], p[1])))
+        poly.append(warpCoord(Minv, (epp[0], epp[1])))
+        poly.append(warpCoord(Minv, (epp[2], epp[3])))
+        for p in reversed(new_pp):
+            poly.append(warpCoord(Minv, (p[2], p[3])))
+        poly.append(warpCoord(Minv, (spp[2], spp[3])))
+
+        # add to final result
+        polys.append(np.array(poly))
+
+    return polys
+
+def getDetBoxes(textmap, linkmap, text_threshold, link_threshold, low_text, poly=False, estimate_num_chars=False):
+    if poly and estimate_num_chars:
+        raise Exception("Estimating the number of characters not currently supported with poly.")
+    boxes, labels, mapper = getDetBoxes_core(textmap, linkmap, text_threshold, link_threshold, low_text, estimate_num_chars)
+
+    if poly:
+        polys = getPoly_core(boxes, labels, mapper, linkmap)
+    else:
+        polys = [None] * len(boxes)
+
+    return boxes, polys, mapper
+
+def adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net = 2):
+    if len(polys) > 0:
+        polys = np.array(polys)
+        for k in range(len(polys)):
+            if polys[k] is not None:
+                polys[k] *= (ratio_w * ratio_net, ratio_h * ratio_net)
+    return polys
+
+
+def normalizeMeanVariance(in_img, mean=(0.485, 0.456, 0.406), variance=(0.229, 0.224, 0.225)):
+    # should be RGB order
+    img = in_img.copy().astype(np.float32)
+
+    img -= np.array([mean[0] * 255.0, mean[1] * 255.0, mean[2] * 255.0], dtype=np.float32)
+    img /= np.array([variance[0] * 255.0, variance[1] * 255.0, variance[2] * 255.0], dtype=np.float32)
+    return img
+
+
+def group_text_box(polys, slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5, width_ths = 1.0, add_margin = 0.05, sort_output = True):
+    # poly top-left, top-right, low-right, low-left
+    horizontal_list, free_list,combined_list, merged_list = [],[],[],[]
+
+    for poly in polys:
+        slope_up = (poly[3]-poly[1])/np.maximum(10, (poly[2]-poly[0]))
+        slope_down = (poly[5]-poly[7])/np.maximum(10, (poly[4]-poly[6]))
+        if max(abs(slope_up), abs(slope_down)) < slope_ths:
+            x_max = max([poly[0],poly[2],poly[4],poly[6]])
+            x_min = min([poly[0],poly[2],poly[4],poly[6]])
+            y_max = max([poly[1],poly[3],poly[5],poly[7]])
+            y_min = min([poly[1],poly[3],poly[5],poly[7]])
+            horizontal_list.append([x_min, x_max, y_min, y_max, 0.5*(y_min+y_max), y_max-y_min])
+        else:
+            height = np.linalg.norm([poly[6]-poly[0],poly[7]-poly[1]])
+            width = np.linalg.norm([poly[2]-poly[0],poly[3]-poly[1]])
+
+            margin = int(1.44*add_margin*min(width, height))
+
+            theta13 = abs(np.arctan( (poly[1]-poly[5])/np.maximum(10, (poly[0]-poly[4]))))
+            theta24 = abs(np.arctan( (poly[3]-poly[7])/np.maximum(10, (poly[2]-poly[6]))))
+            # do I need to clip minimum, maximum value here?
+            x1 = poly[0] - np.cos(theta13)*margin
+            y1 = poly[1] - np.sin(theta13)*margin
+            x2 = poly[2] + np.cos(theta24)*margin
+            y2 = poly[3] - np.sin(theta24)*margin
+            x3 = poly[4] + np.cos(theta13)*margin
+            y3 = poly[5] + np.sin(theta13)*margin
+            x4 = poly[6] - np.cos(theta24)*margin
+            y4 = poly[7] + np.sin(theta24)*margin
+
+            free_list.append([[x1,y1],[x2,y2],[x3,y3],[x4,y4]])
+    if sort_output:
+        horizontal_list = sorted(horizontal_list, key=lambda item: item[4])
+
+    # combine box
+    new_box = []
+    for poly in horizontal_list:
+
+        if len(new_box) == 0:
+            b_height = [poly[5]]
+            b_ycenter = [poly[4]]
+            new_box.append(poly)
+        else:
+            # comparable height and comparable y_center level up to ths*height
+            if abs(np.mean(b_ycenter) - poly[4]) < ycenter_ths*np.mean(b_height):
+                b_height.append(poly[5])
+                b_ycenter.append(poly[4])
+                new_box.append(poly)
+            else:
+                b_height = [poly[5]]
+                b_ycenter = [poly[4]]
+                combined_list.append(new_box)
+                new_box = [poly]
+    combined_list.append(new_box)
+
+    # merge list use sort again
+    for boxes in combined_list:
+        if len(boxes) == 1: # one box per line
+            box = boxes[0]
+            margin = int(add_margin*min(box[1]-box[0],box[5]))
+            merged_list.append([box[0]-margin,box[1]+margin,box[2]-margin,box[3]+margin])
+        else: # multiple boxes per line
+            boxes = sorted(boxes, key=lambda item: item[0])
+
+            merged_box, new_box = [],[]
+            for box in boxes:
+                if len(new_box) == 0:
+                    b_height = [box[5]]
+                    x_max = box[1]
+                    new_box.append(box)
+                else:
+                    if (abs(np.mean(b_height) - box[5]) < height_ths*np.mean(b_height)) and ((box[0]-x_max) < width_ths *(box[3]-box[2])): # merge boxes
+                        b_height.append(box[5])
+                        x_max = box[1]
+                        new_box.append(box)
+                    else:
+                        b_height = [box[5]]
+                        x_max = box[1]
+                        merged_box.append(new_box)
+                        new_box = [box]
+            if len(new_box) >0: merged_box.append(new_box)
+
+            for mbox in merged_box:
+                if len(mbox) != 1: # adjacent box in same line
+                    # do I need to add margin here?
+                    x_min = min(mbox, key=lambda x: x[0])[0]
+                    x_max = max(mbox, key=lambda x: x[1])[1]
+                    y_min = min(mbox, key=lambda x: x[2])[2]
+                    y_max = max(mbox, key=lambda x: x[3])[3]
+
+                    box_width = x_max - x_min
+                    box_height = y_max - y_min
+                    margin = int(add_margin * (min(box_width, box_height)))
+
+                    merged_list.append([x_min-margin, x_max+margin, y_min-margin, y_max+margin])
+                else: # non adjacent box in same line
+                    box = mbox[0]
+
+                    box_width = box[1] - box[0]
+                    box_height = box[3] - box[2]
+                    margin = int(add_margin * (min(box_width, box_height)))
+
+                    merged_list.append([box[0]-margin,box[1]+margin,box[2]-margin,box[3]+margin])
+    # may need to check if box is really in image
+    return merged_list, free_list
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py
index bfcf79529..a440b9c66 100644
--- a/examples/pipelines/datacomp/components/detect_text/src/main.py
+++ b/examples/pipelines/datacomp/components/detect_text/src/main.py
@@ -9,9 +9,7 @@

 from huggingface_hub import hf_hub_download

-from easyocr.craft_utils import getDetBoxes, adjustResultCoordinates
-from easyocr.imgproc import normalizeMeanVariance
-from easyocr.utils import group_text_box
+from easyocr_utils import getDetBoxes, adjustResultCoordinates, normalizeMeanVariance, group_text_box

 import torch
 import onnxruntime as ort
@@ -58,7 +56,7 @@ def get_boxes(image_data, session):
         image = Image.open(io.BytesIO(image_data)).convert("RGB")
         image = np.array(image)
     except:
-        return [None]
+        return []

     # Use Pillow instead of cv2
     img_resized, target_ratio, size_heatmap = resize_aspect_ratio_pillow(img=image,
diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile
index c5c2a7767..5ff146228 100644
--- a/examples/pipelines/datacomp/components/download_images/Dockerfile
+++ b/examples/pipelines/datacomp/components/download_images/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=7ec067b0d610d9658ba2dffe0ac33475926840bb
+ARG FONDANT_VERSION=3d433d9e8dfeba967236445657dd8f415726de9a
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
index e988ebe46..9f4e2b3df 100644
--- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Download images
 description: Component that downloads images based on URLs
-image: ghcr.io/ml6team/download_images:7ec067b0d610d9658ba2dffe0ac33475926840bb
+image: ghcr.io/ml6team/download_images:3d433d9e8dfeba967236445657dd8f415726de9a

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index dfe6f9f4f..b2b609961 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -15,8 +15,8 @@
 pipeline = Pipeline(
     pipeline_name="datacomp-filtering-pipeline",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
-    base_path=PipelineConfigs.BASE_PATH,
-    # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
+    # base_path=PipelineConfigs.BASE_PATH,
+    base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
 )
 client = Client(host=PipelineConfigs.HOST)
@@ -90,7 +90,7 @@
 # pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
 # pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
 pipeline.add_op(download_images_op, dependencies=load_from_hub_op)
-pipeline.add_op(detect_text_op, dependencies=download_images_op)
+# pipeline.add_op(detect_text_op, dependencies=download_images_op)
 # pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops

From fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Sun, 6 Aug 2023 09:52:39 +0200
Subject: [PATCH 60/65] More improvements

---
 .../components/detect_text/Dockerfile              |   3 +-
 .../detect_text/fondant_component.yaml             |   2 +-
 .../components/detect_text/requirements.txt        |   1 -
 .../components/detect_text/src/main.py             |   2 +-
 .../components/detect_text_gpu/Dockerfile          |  23 ++
 .../detect_text_gpu/fondant_component.yaml         |  21 +
 .../detect_text_gpu/requirements.txt               |   4 +
 .../detect_text_gpu/src/easyocr_utils.py           | 366 ++++++++++++++++++
 .../components/detect_text_gpu/src/main.py         | 140 +++++++
 .../detect_text_torch_gpu/Dockerfile               |  23 ++
 .../fondant_component.yaml                         |  21 +
 .../detect_text_torch_gpu/requirements.txt         |   4 +
 .../detect_text_torch_gpu/src/main.py              | 142 +++++++
 .../download_images/fondant_component.yaml         |   2 +-
 .../components/download_images/src/main.py         |   1 +
 .../datacomp/components/dummy/Dockerfile           |  23 ++
 .../components/dummy/fondant_component.yaml        |   9 +
 .../components/dummy/requirements.txt              |   0
 .../datacomp/components/dummy/src/main.py          |  26 ++
 examples/pipelines/datacomp/pipeline.py            |  31 +-
 20 files changed, 826 insertions(+), 18 deletions(-)
 create mode 100644 examples/pipelines/datacomp/components/detect_text_gpu/Dockerfile
 create mode 100644 examples/pipelines/datacomp/components/detect_text_gpu/fondant_component.yaml
 create mode 100644 examples/pipelines/datacomp/components/detect_text_gpu/requirements.txt
 create mode 100644 examples/pipelines/datacomp/components/detect_text_gpu/src/easyocr_utils.py
 create mode 100644 examples/pipelines/datacomp/components/detect_text_gpu/src/main.py
 create mode 100644 examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile
 create mode 100644 examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
 create mode 100644 examples/pipelines/datacomp/components/detect_text_torch_gpu/requirements.txt
 create mode 100644 examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py
 create mode 100644 examples/pipelines/datacomp/components/dummy/Dockerfile
 create mode 100644 examples/pipelines/datacomp/components/dummy/fondant_component.yaml
 create mode 100644 examples/pipelines/datacomp/components/dummy/requirements.txt
 create mode 100644 examples/pipelines/datacomp/components/dummy/src/main.py

diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile
index b52d9a008..ad09d730e 100644
--- a/examples/pipelines/datacomp/components/detect_text/Dockerfile
+++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile
@@ -8,10 +8,11 @@ RUN apt-get update && \
 # Install requirements
 COPY requirements.txt ./
 RUN pip3 install --no-cache-dir -r requirements.txt
+RUN pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu

 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=dada0106b4a39ad53025704541231d350ee262a2
+ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}

 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
index 7a6579aa0..357feb55f 100644
--- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Detect text
 description: Component that detects text in images
-image: ghcr.io/ml6team/detect_text:3d433d9e8dfeba967236445657dd8f415726de9a
+image: ghcr.io/ml6team/detect_text:cc418db0964ee202ebf2d4fd3aaee15604604244

 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/detect_text/requirements.txt b/examples/pipelines/datacomp/components/detect_text/requirements.txt
index 62fc9dd06..823417f9a 100644
--- a/examples/pipelines/datacomp/components/detect_text/requirements.txt
+++ b/examples/pipelines/datacomp/components/detect_text/requirements.txt
@@ -1,5 +1,4 @@
 huggingface-hub==0.16.4
 onnxruntime==1.15.1
-torch==2.0.1
 opencv-python-headless
 scipy
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text/src/main.py b/examples/pipelines/datacomp/components/detect_text/src/main.py
index a440b9c66..709017ffb 100644
--- a/examples/pipelines/datacomp/components/detect_text/src/main.py
+++ b/examples/pipelines/datacomp/components/detect_text/src/main.py
@@ -34,7 +34,7 @@ def resize_aspect_ratio_pillow(img, square_size, mag_ratio=1):
     target_h, target_w = int(height * ratio), int(width * ratio)

     img = Image.fromarray(img)
-    proc = img.resize((target_w, target_h), resample = Image.BILINEAR)
+    proc = img.resize((target_w, target_h), resample = Image.Resampling.BILINEAR)

     # make canvas and paste image
     target_h32, target_w32 = target_h, target_w
diff --git a/examples/pipelines/datacomp/components/detect_text_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_gpu/Dockerfile
new file mode 100644
index 000000000..ed1861562
--- /dev/null
+++ b/examples/pipelines/datacomp/components/detect_text_gpu/Dockerfile
@@ -0,0 +1,23 @@
+FROM --platform=linux/amd64 pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
+
+# System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# Install requirements
+COPY requirements.txt ./
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Install Fondant
+# This is split from other requirements to leverage caching
+ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244
+RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
+
+# Set the working directory to the component folder
+WORKDIR /component/src
+
+# Copy over src-files
+COPY src/ .
+ +ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_gpu/fondant_component.yaml new file mode 100644 index 000000000..9dfb4d69a --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text_gpu/fondant_component.yaml @@ -0,0 +1,21 @@ +name: Detect text +description: Component that detects text in images +image: ghcr.io/ml6team/detect_text_gpu:cc418db0964ee202ebf2d4fd3aaee15604604244 + +consumes: + image: + fields: + data: + type: binary + +produces: + image: + fields: + data: + type: binary + boxes: + type: array + items: + type: array + items: + type: int64 \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text_gpu/requirements.txt b/examples/pipelines/datacomp/components/detect_text_gpu/requirements.txt new file mode 100644 index 000000000..ee7989a30 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text_gpu/requirements.txt @@ -0,0 +1,4 @@ +huggingface-hub==0.16.4 +onnxruntime-gpu==1.15.1 +opencv-python-headless +scipy \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text_gpu/src/easyocr_utils.py b/examples/pipelines/datacomp/components/detect_text_gpu/src/easyocr_utils.py new file mode 100644 index 000000000..4feb67768 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text_gpu/src/easyocr_utils.py @@ -0,0 +1,366 @@ +""" +Copyright (c) 2019-present NAVER Corp. +MIT License +""" +import numpy as np +import cv2 +import math +from scipy.ndimage import label + +""" auxiliary functions """ +# unwarp corodinates +def warpCoord(Minv, pt): + out = np.matmul(Minv, (pt[0], pt[1], 1)) + return np.array([out[0]/out[2], out[1]/out[2]]) +""" end of auxiliary functions """ + + +def getDetBoxes_core(textmap, linkmap, text_threshold, link_threshold, low_text, estimate_num_chars=False): + # prepare data + linkmap = linkmap.copy() + textmap = textmap.copy() + img_h, img_w = textmap.shape + + """ labeling method """ + ret, text_score = cv2.threshold(textmap, low_text, 1, 0) + ret, link_score = cv2.threshold(linkmap, link_threshold, 1, 0) + + text_score_comb = np.clip(text_score + link_score, 0, 1) + nLabels, labels, stats, centroids = cv2.connectedComponentsWithStats(text_score_comb.astype(np.uint8), connectivity=4) + + det = [] + mapper = [] + for k in range(1,nLabels): + # size filtering + size = stats[k, cv2.CC_STAT_AREA] + if size < 10: continue + + # thresholding + if np.max(textmap[labels==k]) < text_threshold: continue + + # make segmentation map + segmap = np.zeros(textmap.shape, dtype=np.uint8) + segmap[labels==k] = 255 + if estimate_num_chars: + _, character_locs = cv2.threshold((textmap - linkmap) * segmap /255., text_threshold, 1, 0) + _, n_chars = label(character_locs) + mapper.append(n_chars) + else: + mapper.append(k) + segmap[np.logical_and(link_score==1, text_score==0)] = 0 # remove link area + x, y = stats[k, cv2.CC_STAT_LEFT], stats[k, cv2.CC_STAT_TOP] + w, h = stats[k, cv2.CC_STAT_WIDTH], stats[k, cv2.CC_STAT_HEIGHT] + niter = int(math.sqrt(size * min(w, h) / (w * h)) * 2) + sx, ex, sy, ey = x - niter, x + w + niter + 1, y - niter, y + h + niter + 1 + # boundary check + if sx < 0 : sx = 0 + if sy < 0 : sy = 0 + if ex >= img_w: ex = img_w + if ey >= img_h: ey = img_h + kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(1 + niter, 1 + niter)) + segmap[sy:ey, sx:ex] = cv2.dilate(segmap[sy:ey, sx:ex], kernel) + + # make box 
+ np_contours = np.roll(np.array(np.where(segmap!=0)),1,axis=0).transpose().reshape(-1,2) + rectangle = cv2.minAreaRect(np_contours) + box = cv2.boxPoints(rectangle) + + # align diamond-shape + w, h = np.linalg.norm(box[0] - box[1]), np.linalg.norm(box[1] - box[2]) + box_ratio = max(w, h) / (min(w, h) + 1e-5) + if abs(1 - box_ratio) <= 0.1: + l, r = min(np_contours[:,0]), max(np_contours[:,0]) + t, b = min(np_contours[:,1]), max(np_contours[:,1]) + box = np.array([[l, t], [r, t], [r, b], [l, b]], dtype=np.float32) + + # make clock-wise order + startidx = box.sum(axis=1).argmin() + box = np.roll(box, 4-startidx, 0) + box = np.array(box) + + det.append(box) + + return det, labels, mapper + +def getPoly_core(boxes, labels, mapper, linkmap): + # configs + num_cp = 5 + max_len_ratio = 0.7 + expand_ratio = 1.45 + max_r = 2.0 + step_r = 0.2 + + polys = [] + for k, box in enumerate(boxes): + # size filter for small instance + w, h = int(np.linalg.norm(box[0] - box[1]) + 1), int(np.linalg.norm(box[1] - box[2]) + 1) + if w < 10 or h < 10: + polys.append(None); continue + + # warp image + tar = np.float32([[0,0],[w,0],[w,h],[0,h]]) + M = cv2.getPerspectiveTransform(box, tar) + word_label = cv2.warpPerspective(labels, M, (w, h), flags=cv2.INTER_NEAREST) + try: + Minv = np.linalg.inv(M) + except: + polys.append(None); continue + + # binarization for selected label + cur_label = mapper[k] + word_label[word_label != cur_label] = 0 + word_label[word_label > 0] = 1 + + """ Polygon generation """ + # find top/bottom contours + cp = [] + max_len = -1 + for i in range(w): + region = np.where(word_label[:,i] != 0)[0] + if len(region) < 2 : continue + cp.append((i, region[0], region[-1])) + length = region[-1] - region[0] + 1 + if length > max_len: max_len = length + + # pass if max_len is similar to h + if h * max_len_ratio < max_len: + polys.append(None); continue + + # get pivot points with fixed length + tot_seg = num_cp * 2 + 1 + seg_w = w / tot_seg # segment width + pp = [None] * num_cp # init pivot points + cp_section = [[0, 0]] * tot_seg + seg_height = [0] * num_cp + seg_num = 0 + num_sec = 0 + prev_h = -1 + for i in range(0,len(cp)): + (x, sy, ey) = cp[i] + if (seg_num + 1) * seg_w <= x and seg_num <= tot_seg: + # average previous segment + if num_sec == 0: break + cp_section[seg_num] = [cp_section[seg_num][0] / num_sec, cp_section[seg_num][1] / num_sec] + num_sec = 0 + + # reset variables + seg_num += 1 + prev_h = -1 + + # accumulate center points + cy = (sy + ey) * 0.5 + cur_h = ey - sy + 1 + cp_section[seg_num] = [cp_section[seg_num][0] + x, cp_section[seg_num][1] + cy] + num_sec += 1 + + if seg_num % 2 == 0: continue # No polygon area + + if prev_h < cur_h: + pp[int((seg_num - 1)/2)] = (x, cy) + seg_height[int((seg_num - 1)/2)] = cur_h + prev_h = cur_h + + # processing last segment + if num_sec != 0: + cp_section[-1] = [cp_section[-1][0] / num_sec, cp_section[-1][1] / num_sec] + + # pass if num of pivots is not sufficient or segment width is smaller than character height + if None in pp or seg_w < np.max(seg_height) * 0.25: + polys.append(None); continue + + # calc median maximum of pivot points + half_char_h = np.median(seg_height) * expand_ratio / 2 + + # calc gradiant and apply to make horizontal pivots + new_pp = [] + for i, (x, cy) in enumerate(pp): + dx = cp_section[i * 2 + 2][0] - cp_section[i * 2][0] + dy = cp_section[i * 2 + 2][1] - cp_section[i * 2][1] + if dx == 0: # gradient if zero + new_pp.append([x, cy - half_char_h, x, cy + half_char_h]) + continue + rad = - math.atan2(dy, dx) + c, 
s = half_char_h * math.cos(rad), half_char_h * math.sin(rad) + new_pp.append([x - s, cy - c, x + s, cy + c]) + + # get edge points to cover character heatmaps + isSppFound, isEppFound = False, False + grad_s = (pp[1][1] - pp[0][1]) / (pp[1][0] - pp[0][0]) + (pp[2][1] - pp[1][1]) / (pp[2][0] - pp[1][0]) + grad_e = (pp[-2][1] - pp[-1][1]) / (pp[-2][0] - pp[-1][0]) + (pp[-3][1] - pp[-2][1]) / (pp[-3][0] - pp[-2][0]) + for r in np.arange(0.5, max_r, step_r): + dx = 2 * half_char_h * r + if not isSppFound: + line_img = np.zeros(word_label.shape, dtype=np.uint8) + dy = grad_s * dx + p = np.array(new_pp[0]) - np.array([dx, dy, dx, dy]) + cv2.line(line_img, (int(p[0]), int(p[1])), (int(p[2]), int(p[3])), 1, thickness=1) + if np.sum(np.logical_and(word_label, line_img)) == 0 or r + 2 * step_r >= max_r: + spp = p + isSppFound = True + if not isEppFound: + line_img = np.zeros(word_label.shape, dtype=np.uint8) + dy = grad_e * dx + p = np.array(new_pp[-1]) + np.array([dx, dy, dx, dy]) + cv2.line(line_img, (int(p[0]), int(p[1])), (int(p[2]), int(p[3])), 1, thickness=1) + if np.sum(np.logical_and(word_label, line_img)) == 0 or r + 2 * step_r >= max_r: + epp = p + isEppFound = True + if isSppFound and isEppFound: + break + + # pass if boundary of polygon is not found + if not (isSppFound and isEppFound): + polys.append(None); continue + + # make final polygon + poly = [] + poly.append(warpCoord(Minv, (spp[0], spp[1]))) + for p in new_pp: + poly.append(warpCoord(Minv, (p[0], p[1]))) + poly.append(warpCoord(Minv, (epp[0], epp[1]))) + poly.append(warpCoord(Minv, (epp[2], epp[3]))) + for p in reversed(new_pp): + poly.append(warpCoord(Minv, (p[2], p[3]))) + poly.append(warpCoord(Minv, (spp[2], spp[3]))) + + # add to final result + polys.append(np.array(poly)) + + return polys + +def getDetBoxes(textmap, linkmap, text_threshold, link_threshold, low_text, poly=False, estimate_num_chars=False): + if poly and estimate_num_chars: + raise Exception("Estimating the number of characters not currently supported with poly.") + boxes, labels, mapper = getDetBoxes_core(textmap, linkmap, text_threshold, link_threshold, low_text, estimate_num_chars) + + if poly: + polys = getPoly_core(boxes, labels, mapper, linkmap) + else: + polys = [None] * len(boxes) + + return boxes, polys, mapper + +def adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net = 2): + if len(polys) > 0: + polys = np.array(polys) + for k in range(len(polys)): + if polys[k] is not None: + polys[k] *= (ratio_w * ratio_net, ratio_h * ratio_net) + return polys + + +def normalizeMeanVariance(in_img, mean=(0.485, 0.456, 0.406), variance=(0.229, 0.224, 0.225)): + # should be RGB order + img = in_img.copy().astype(np.float32) + + img -= np.array([mean[0] * 255.0, mean[1] * 255.0, mean[2] * 255.0], dtype=np.float32) + img /= np.array([variance[0] * 255.0, variance[1] * 255.0, variance[2] * 255.0], dtype=np.float32) + return img + + +def group_text_box(polys, slope_ths = 0.1, ycenter_ths = 0.5, height_ths = 0.5, width_ths = 1.0, add_margin = 0.05, sort_output = True): + # poly top-left, top-right, low-right, low-left + horizontal_list, free_list,combined_list, merged_list = [],[],[],[] + + for poly in polys: + slope_up = (poly[3]-poly[1])/np.maximum(10, (poly[2]-poly[0])) + slope_down = (poly[5]-poly[7])/np.maximum(10, (poly[4]-poly[6])) + if max(abs(slope_up), abs(slope_down)) < slope_ths: + x_max = max([poly[0],poly[2],poly[4],poly[6]]) + x_min = min([poly[0],poly[2],poly[4],poly[6]]) + y_max = max([poly[1],poly[3],poly[5],poly[7]]) + y_min = 
min([poly[1],poly[3],poly[5],poly[7]])
+            horizontal_list.append([x_min, x_max, y_min, y_max, 0.5*(y_min+y_max), y_max-y_min])
+        else:
+            height = np.linalg.norm([poly[6]-poly[0],poly[7]-poly[1]])
+            width = np.linalg.norm([poly[2]-poly[0],poly[3]-poly[1]])
+
+            margin = int(1.44*add_margin*min(width, height))
+
+            theta13 = abs(np.arctan( (poly[1]-poly[5])/np.maximum(10, (poly[0]-poly[4]))))
+            theta24 = abs(np.arctan( (poly[3]-poly[7])/np.maximum(10, (poly[2]-poly[6]))))
+            # do I need to clip minimum, maximum value here?
+            x1 = poly[0] - np.cos(theta13)*margin
+            y1 = poly[1] - np.sin(theta13)*margin
+            x2 = poly[2] + np.cos(theta24)*margin
+            y2 = poly[3] - np.sin(theta24)*margin
+            x3 = poly[4] + np.cos(theta13)*margin
+            y3 = poly[5] + np.sin(theta13)*margin
+            x4 = poly[6] - np.cos(theta24)*margin
+            y4 = poly[7] + np.sin(theta24)*margin
+
+            free_list.append([[x1,y1],[x2,y2],[x3,y3],[x4,y4]])
+    if sort_output:
+        horizontal_list = sorted(horizontal_list, key=lambda item: item[4])
+
+    # combine box
+    new_box = []
+    for poly in horizontal_list:
+
+        if len(new_box) == 0:
+            b_height = [poly[5]]
+            b_ycenter = [poly[4]]
+            new_box.append(poly)
+        else:
+            # comparable height and comparable y_center level up to ths*height
+            if abs(np.mean(b_ycenter) - poly[4]) < ycenter_ths*np.mean(b_height):
+                b_height.append(poly[5])
+                b_ycenter.append(poly[4])
+                new_box.append(poly)
+            else:
+                b_height = [poly[5]]
+                b_ycenter = [poly[4]]
+                combined_list.append(new_box)
+                new_box = [poly]
+    combined_list.append(new_box)
+
+    # merge boxes within each line; sort again by x-coordinate
+    for boxes in combined_list:
+        if len(boxes) == 1: # one box per line
+            box = boxes[0]
+            margin = int(add_margin*min(box[1]-box[0],box[5]))
+            merged_list.append([box[0]-margin,box[1]+margin,box[2]-margin,box[3]+margin])
+        else: # multiple boxes per line
+            boxes = sorted(boxes, key=lambda item: item[0])
+
+            merged_box, new_box = [],[]
+            for box in boxes:
+                if len(new_box) == 0:
+                    b_height = [box[5]]
+                    x_max = box[1]
+                    new_box.append(box)
+                else:
+                    if (abs(np.mean(b_height) - box[5]) < height_ths*np.mean(b_height)) and ((box[0]-x_max) < width_ths *(box[3]-box[2])): # merge boxes
+                        b_height.append(box[5])
+                        x_max = box[1]
+                        new_box.append(box)
+                    else:
+                        b_height = [box[5]]
+                        x_max = box[1]
+                        merged_box.append(new_box)
+                        new_box = [box]
+            if len(new_box) > 0: merged_box.append(new_box)
+
+            for mbox in merged_box:
+                if len(mbox) != 1: # adjacent boxes in same line
+                    # do I need to add margin here?
+                    x_min = min(mbox, key=lambda x: x[0])[0]
+                    x_max = max(mbox, key=lambda x: x[1])[1]
+                    y_min = min(mbox, key=lambda x: x[2])[2]
+                    y_max = max(mbox, key=lambda x: x[3])[3]
+
+                    box_width = x_max - x_min
+                    box_height = y_max - y_min
+                    margin = int(add_margin * (min(box_width, box_height)))
+
+                    merged_list.append([x_min-margin, x_max+margin, y_min-margin, y_max+margin])
+                else: # non-adjacent box in same line
+                    box = mbox[0]
+
+                    box_width = box[1] - box[0]
+                    box_height = box[3] - box[2]
+                    margin = int(add_margin * (min(box_width, box_height)))
+
+                    merged_list.append([box[0]-margin,box[1]+margin,box[2]-margin,box[3]+margin])
+    # may need to check if box is really in image
+    return merged_list, free_list
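+
+# Example usage (a minimal sketch): these helpers are chained exactly as in
+# this component's `get_boxes`, turning the two CRAFT score maps into grouped
+# horizontal boxes. `score_text`, `score_link`, `ratio_w` and `ratio_h` are
+# assumed to come from the model output and the resize step:
+#
+#   boxes, _, _ = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4)
+#   boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h)
+#   polys = [np.array(box).astype(np.int32).reshape(-1) for box in boxes]
+#   horizontal_list, free_list = group_text_box(polys)
\ No newline at end of file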
+""" +import logging + +import dask.dataframe as dd +import numpy as np +import io +from PIL import Image + +from huggingface_hub import hf_hub_download + +from easyocr_utils import getDetBoxes, adjustResultCoordinates, normalizeMeanVariance, group_text_box + +import torch +import onnxruntime as ort + +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor + +logger = logging.getLogger(__name__) + + +def resize_aspect_ratio_pillow(img, square_size, mag_ratio=1): + height, width, channel = img.shape + + # magnify image size + target_size = mag_ratio * max(height, width) + + # set original image size + if target_size > square_size: + target_size = square_size + + ratio = target_size / max(height, width) + + target_h, target_w = int(height * ratio), int(width * ratio) + img = Image.fromarray(img) + proc = img.resize((target_w, target_h), resample = Image.Resampling.BILINEAR) + + # make canvas and paste image + target_h32, target_w32 = target_h, target_w + if target_h % 32 != 0: + target_h32 = target_h + (32 - target_h % 32) + if target_w % 32 != 0: + target_w32 = target_w + (32 - target_w % 32) + resized = np.zeros((target_h32, target_w32, channel), dtype=np.float32) + resized[0:target_h, 0:target_w, :] = proc + target_h, target_w = target_h32, target_w32 + + size_heatmap = (int(target_w/2), int(target_h/2)) + + return resized, ratio, size_heatmap + + +def get_boxes(image_data, session): + try: + image = Image.open(io.BytesIO(image_data)).convert("RGB") + image = np.array(image) + except: + return [] + + # Use Pillow instead of cv2 + img_resized, target_ratio, size_heatmap = resize_aspect_ratio_pillow(img=image, + square_size=512, + mag_ratio=1.0) + + ratio_h = ratio_w = 1 / target_ratio + x = normalizeMeanVariance(img_resized) + x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0) + + input_name = session.get_inputs()[0].name + + # Prepare input tensor for inference + inp = {input_name: x.numpy()} + + # Run inference and get output + y, _ = session.run(None, inp) + + # Extract score and link maps + score_text = y[0, :, :, 0] + score_link = y[0, :, :, 1] + + # Post-processing to obtain bounding boxes and polygons + boxes, _, _ = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4) + boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h) + + # Create horizontal reading list + polys = [] + for box in boxes: + poly = np.array(box).astype(np.int32).reshape((-1)) + polys.append(poly) + + horizontal_list, _ = group_text_box(polys) + + return horizontal_list + + +def get_boxes_dataframe(df, session): + # process a single partition + # TODO make column name more flexible + df["image_boxes"] = df.image_data.apply(lambda x: + get_boxes( + image_data=x, session=session, + ), + ) + + return df + + +class DetextTextComponent(DaskTransformComponent): + """Component that detexts text in images, using the CRAFT model. 
+ """ + + def __init__(self, *args) -> None: + + craft_onnx = hf_hub_download(repo_id="ml6team/craft-onnx", filename="craft.onnx", repo_type="model") + logger.info(f"Device: {ort.get_device()}") + providers = [('CUDAExecutionProvider', {"cudnn_conv_algo_search": "DEFAULT"}), 'CPUExecutionProvider'] if ort.get_device() == 'GPU' else ['CPUExecutionProvider'] + self.session = ort.InferenceSession(craft_onnx, providers=providers) + + def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + + # create meta + # needs to be a dictionary with keys = column names, values = dtypes of columns + # for each column in the output + meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} + meta["image_boxes"] = np.dtype(object) + + logger.info("Detecting texts..") + dataframe = dataframe.map_partitions( + get_boxes_dataframe, + session=self.session, + meta=meta, + ) + + return dataframe + + +if __name__ == "__main__": + executor = DaskTransformExecutor.from_args() + executor.execute(DetextTextComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile new file mode 100644 index 000000000..ed1861562 --- /dev/null +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile @@ -0,0 +1,23 @@ +FROM --platform=linux/amd64 pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime + +# System dependencies +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install git -y + +# Install requirements +COPY requirements.txt ./ +RUN pip3 install --no-cache-dir -r requirements.txt + +# Install Fondant +# This is split from other requirements to leverage caching +ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244 +RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} + +# Set the working directory to the component folder +WORKDIR /component/src + +# Copy over src-files +COPY src/ . 
+
+ENTRYPOINT ["python", "main.py"]
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
new file mode 100644
index 000000000..fda4b0843
--- /dev/null
+++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
@@ -0,0 +1,21 @@
+name: Detect text
+description: Component that detects text in images
+image: ghcr.io/ml6team/detect_text_torch_gpu:cc418db0964ee202ebf2d4fd3aaee15604604244
+
+consumes:
+  image:
+    fields:
+      data:
+        type: binary
+
+produces:
+  image:
+    fields:
+      data:
+        type: binary
+      boxes:
+        type: array
+        items:
+          type: array
+          items:
+            type: int64
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/requirements.txt b/examples/pipelines/datacomp/components/detect_text_torch_gpu/requirements.txt
new file mode 100644
index 000000000..e6bf68322
--- /dev/null
+++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/requirements.txt
@@ -0,0 +1,4 @@
+huggingface-hub==0.16.4
+opencv-python-headless
+scipy
+easyocr==1.7.0
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py
new file mode 100644
index 000000000..c67e0d275
--- /dev/null
+++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py
@@ -0,0 +1,142 @@
+"""This component detects text in images, using CRAFT.
+"""
+import logging
+
+import dask.dataframe as dd
+import numpy as np
+import io
+from PIL import Image
+
+from huggingface_hub import hf_hub_download
+
+from easyocr.craft_utils import getDetBoxes, adjustResultCoordinates
+from easyocr.detection import get_detector
+from easyocr.imgproc import normalizeMeanVariance
+from easyocr.utils import group_text_box
+
+import torch
+
+from fondant.component import DaskTransformComponent
+from fondant.executor import DaskTransformExecutor
+
+logger = logging.getLogger(__name__)
+
+
+def resize_aspect_ratio_pillow(img, square_size, mag_ratio=1):
+    height, width, channel = img.shape
+
+    # magnify image size
+    target_size = mag_ratio * max(height, width)
+
+    # set original image size
+    if target_size > square_size:
+        target_size = square_size
+
+    ratio = target_size / max(height, width)
+
+    target_h, target_w = int(height * ratio), int(width * ratio)
+    img = Image.fromarray(img)
+    proc = img.resize((target_w, target_h), resample=Image.Resampling.BILINEAR)
+
+    # make canvas and paste image
+    target_h32, target_w32 = target_h, target_w
+    if target_h % 32 != 0:
+        target_h32 = target_h + (32 - target_h % 32)
+    if target_w % 32 != 0:
+        target_w32 = target_w + (32 - target_w % 32)
+    resized = np.zeros((target_h32, target_w32, channel), dtype=np.float32)
+    resized[0:target_h, 0:target_w, :] = proc
+    target_h, target_w = target_h32, target_w32
+
+    size_heatmap = (int(target_w/2), int(target_h/2))
+
+    return resized, ratio, size_heatmap
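+
+# Worked example (illustrative numbers, not from the dataset): a 600x400 RGB
+# image with square_size=512 and mag_ratio=1.0 gives target_size=512 and
+# ratio=512/600≈0.853, so the image is resized to 512x341 and pasted on a
+# 512x352 canvas (dimensions are rounded up to a multiple of 32);
+# size_heatmap is then (256, 176), and callers invert `ratio` to map CRAFT
+# output coordinates back to the original image.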
+
+
+def get_boxes(image_data, net):
+    try:
+        image = Image.open(io.BytesIO(image_data)).convert("RGB")
+        image = np.array(image)
+    except Exception:  # skip images that cannot be decoded
+        return []
+
+    # Use Pillow instead of cv2
+    img_resized, target_ratio, size_heatmap = resize_aspect_ratio_pillow(img=image,
+                                                                         square_size=512,
+                                                                         mag_ratio=1.0)
+
+    ratio_h = ratio_w = 1 / target_ratio
+    x = normalizeMeanVariance(img_resized)
+    x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0)
+
+    # Run inference and get output
+    x = x.to(net.device)
+
+    # forward pass
+    with torch.no_grad():
+        y, feature = net(x)
+
+    # Extract score and link maps (move to CPU before converting to numpy)
+    score_text = y[0, :, :, 0].cpu().numpy()
+    score_link = y[0, :, :, 1].cpu().numpy()
+
+    # Post-processing to obtain bounding boxes and polygons
+    boxes, _, _ = getDetBoxes(score_text, score_link, 0.5, 0.4, 0.4)
+    boxes = adjustResultCoordinates(boxes, ratio_w, ratio_h)
+
+    # Create horizontal reading list
+    polys = []
+    for box in boxes:
+        poly = np.array(box).astype(np.int32).reshape((-1))
+        polys.append(poly)
+
+    horizontal_list, _ = group_text_box(polys)
+
+    return horizontal_list
+
+
+def get_boxes_dataframe(df, net):
+    # process a single partition
+    # TODO make column name more flexible
+    df["image_boxes"] = df.image_data.apply(lambda x:
+        get_boxes(
+            image_data=x, net=net,
+        ),
+    )
+
+    return df
+
+
+class DetextTextComponent(DaskTransformComponent):
+    """Component that detects text in images, using the CRAFT model.
+    """
+
+    def __init__(self, *args) -> None:
+
+        filepath = hf_hub_download(repo_id="nielsr/craft-pytorch", filename="net.pth", repo_type="model")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"Device: {device}")
+        self.net = get_detector(filepath, device=device)
+
+    def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:
+
+        # create meta
+        # needs to be a dictionary with keys = column names, values = dtypes of columns
+        # for each column in the output
+        meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)}
+        meta["image_data"] = bytes
+        meta["image_boxes"] = np.dtype(object)
+
+        logger.info("Detecting texts..")
+        dataframe = dataframe.map_partitions(
+            get_boxes_dataframe,
+            net=self.net,
+            meta=meta,
+        )
+
+        return dataframe
+
+
+if __name__ == "__main__":
+    executor = DaskTransformExecutor.from_args()
+    executor.execute(DetextTextComponent)
\ No newline at end of file
diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
index 9f4e2b3df..5e5965144 100644
--- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Download images
 description: Component that downloads images based on URLs
-image: ghcr.io/ml6team/download_images:3d433d9e8dfeba967236445657dd8f415726de9a
+image: ghcr.io/ml6team/download_images:cc418db0964ee202ebf2d4fd3aaee15604604244
 
 consumes:
   image:
diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py
index 10932daa5..a7492f4c2 100644
--- a/examples/pipelines/datacomp/components/download_images/src/main.py
+++ b/examples/pipelines/datacomp/components/download_images/src/main.py
@@ -172,6 +172,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:
         dataframe = dataframe.dropna()
 
         logger.info(f"Length of the final dataframe: {len(dataframe)}")
+        print("Length of final dataframe:", len(dataframe))
 
         return dataframe
 
diff --git a/examples/pipelines/datacomp/components/dummy/Dockerfile b/examples/pipelines/datacomp/components/dummy/Dockerfile
new file mode 100644
index 000000000..2378703ea
--- /dev/null
+++ b/examples/pipelines/datacomp/components/dummy/Dockerfile
@@ -0,0 +1,23 @@
+FROM --platform=linux/amd64 python:3.8-slim
+
+# System dependencies
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install git -y
+
+# Install requirements
Install requirements +COPY requirements.txt / +RUN pip3 install --no-cache-dir -r requirements.txt + +# Install Fondant +# This is split from other requirements to leverage caching +ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244 +RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} + +# Set the working directory to the component folder +WORKDIR /component/src + +# Copy over src-files +COPY src/ . + +ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/dummy/fondant_component.yaml b/examples/pipelines/datacomp/components/dummy/fondant_component.yaml new file mode 100644 index 000000000..346f3aa1b --- /dev/null +++ b/examples/pipelines/datacomp/components/dummy/fondant_component.yaml @@ -0,0 +1,9 @@ +name: Dummy component +description: Component that downloads images based on URLs +image: ghcr.io/ml6team/dummy:cc418db0964ee202ebf2d4fd3aaee15604604244 + +consumes: + image: + fields: + data: + type: binary \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/dummy/requirements.txt b/examples/pipelines/datacomp/components/dummy/requirements.txt new file mode 100644 index 000000000..e69de29bb diff --git a/examples/pipelines/datacomp/components/dummy/src/main.py b/examples/pipelines/datacomp/components/dummy/src/main.py new file mode 100644 index 000000000..5700c1936 --- /dev/null +++ b/examples/pipelines/datacomp/components/dummy/src/main.py @@ -0,0 +1,26 @@ +""" +Dummy component for debugging. +""" +import logging + +import dask.dataframe as dd + +from fondant.component import DaskTransformComponent +from fondant.executor import DaskTransformExecutor + +logger = logging.getLogger(__name__) + + +class DummyComponent(DaskTransformComponent): + """Component that downloads images based on URLs.""" + + def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + + logger.info(f"Length of the dataframe: {len(dataframe)}") + + return dataframe + + +if __name__ == "__main__": + executor = DaskTransformExecutor.from_args() + executor.execute(DummyComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index b2b609961..ab25454b6 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -15,8 +15,8 @@ pipeline = Pipeline( pipeline_name="datacomp-filtering-pipeline", pipeline_description="A pipeline for filtering the Datacomp dataset", - # base_path=PipelineConfigs.BASE_PATH, - base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", + base_path=PipelineConfigs.BASE_PATH, + # base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp", ) client = Client(host=PipelineConfigs.HOST) @@ -63,13 +63,6 @@ "min_num_actions": 1, }, ) -cluster_image_embeddings_op = ComponentOp( - component_dir="components/cluster_image_embeddings", - arguments={ - "sample_ratio": 0.3, - "num_clusters": 3, - }, -) download_images_op = ComponentOp( component_dir="components/download_images", node_pool_label="node_pool", @@ -83,15 +76,27 @@ node_pool_name="model-inference-pool", output_partition_size="disable", ) +# dummpy_op = ComponentOp( +# component_dir="components/dummy", +# ) +detect_text_gpu_op = ComponentOp( + component_dir="components/detect_text_torch_gpu", + number_of_gpus=1, + node_pool_label="node_pool", + node_pool_name="model-inference-pool", + output_partition_size="disable", +) +# dummpy_op = ComponentOp( +# 
component_dir="components/dummy", +# ) + + # add ops to pipeline pipeline.add_op(load_from_hub_op) -# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op) -# pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op) pipeline.add_op(download_images_op, dependencies=load_from_hub_op) -# pipeline.add_op(detect_text_op, dependencies=download_images_op) -# pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op) +pipeline.add_op(detect_text_gpu_op, dependencies=download_images_op) # TODO add more ops client.compile_and_run(pipeline=pipeline) \ No newline at end of file From be3e4b83dcde14f6c8a1b33078c99ead1d40655e Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 6 Aug 2023 10:23:41 +0200 Subject: [PATCH 61/65] Add print statement --- .../datacomp/components/detect_text_torch_gpu/Dockerfile | 2 +- .../detect_text_torch_gpu/fondant_component.yaml | 2 +- .../datacomp/components/download_images/Dockerfile | 2 +- .../components/download_images/fondant_component.yaml | 2 +- .../datacomp/components/download_images/src/main.py | 8 ++++---- src/fondant/data_io.py | 1 + 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile index ed1861562..54211fa15 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244 +ARG FONDANT_VERSION=fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml index fda4b0843..c75c80fc3 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text_torch_gpu:cc418db0964ee202ebf2d4fd3aaee15604604244 +image: ghcr.io/ml6team/detect_text_torch_gpu:fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 consumes: image: diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile index 5ff146228..e747f39e9 100644 --- a/examples/pipelines/datacomp/components/download_images/Dockerfile +++ b/examples/pipelines/datacomp/components/download_images/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=3d433d9e8dfeba967236445657dd8f415726de9a +ARG FONDANT_VERSION=fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml 
index 5e5965144..6d700e80c 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:cc418db0964ee202ebf2d4fd3aaee15604604244 +image: ghcr.io/ml6team/download_images:fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 consumes: image: diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index a7492f4c2..c21543f98 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -153,10 +153,10 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: # needs to be a dictionary with keys = column names, values = dtypes of columns # for each column in the output meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} - meta["data"] = np.dtype(bytes) - meta["width"] = np.dtype(int) - meta["height"] = np.dtype(int) - + meta["data"] = bytes + meta["width"] = int + meta["height"] = int + dataframe = dataframe.map_partitions( download_image_with_retry_partition, timeout=self.timeout, diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py index 4fa442747..9683e7e3d 100644 --- a/src/fondant/data_io.py +++ b/src/fondant/data_io.py @@ -272,6 +272,7 @@ def _write_subset( location = self.manifest.subsets[subset_name].location schema = {field.name: field.type.value for field in subset_spec.fields.values()} + print(f"Schema of {subset_name}:", schema) return self._create_write_task(dataframe, location=location, schema=schema) From b40c7d9b579b0fc3531c871c82f39c1a395c2ee4 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 6 Aug 2023 11:30:55 +0200 Subject: [PATCH 62/65] Add more print statements --- .../datacomp/components/detect_text_torch_gpu/Dockerfile | 2 +- .../components/detect_text_torch_gpu/fondant_component.yaml | 2 +- .../datacomp/components/detect_text_torch_gpu/src/main.py | 3 +++ .../datacomp/components/download_images/src/main.py | 4 +++- examples/pipelines/datacomp/components/dummy/Dockerfile | 2 +- .../datacomp/components/dummy/fondant_component.yaml | 2 +- examples/pipelines/datacomp/components/dummy/src/main.py | 2 ++ examples/pipelines/datacomp/pipeline.py | 6 +++--- src/fondant/data_io.py | 3 +++ 9 files changed, 18 insertions(+), 8 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile index 54211fa15..6c3c2a504 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 +ARG FONDANT_VERSION=be3e4b83dcde14f6c8a1b33078c99ead1d40655e RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml index c75c80fc3..77a2590d2 100644 --- 
a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text_torch_gpu:fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 +image: ghcr.io/ml6team/detect_text_torch_gpu:be3e4b83dcde14f6c8a1b33078c99ead1d40655e consumes: image: diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py index c67e0d275..ef04907d9 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py @@ -120,6 +120,9 @@ def __init__(self, *args) -> None: def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: + # cast image_data to the right dtype + dataframe = dataframe.astype({'image_data': bytes}) + # create meta # needs to be a dictionary with keys = column names, values = dtypes of columns # for each column in the output diff --git a/examples/pipelines/datacomp/components/download_images/src/main.py b/examples/pipelines/datacomp/components/download_images/src/main.py index c21543f98..82067c56d 100644 --- a/examples/pipelines/datacomp/components/download_images/src/main.py +++ b/examples/pipelines/datacomp/components/download_images/src/main.py @@ -156,7 +156,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: meta["data"] = bytes meta["width"] = int meta["height"] = int - + dataframe = dataframe.map_partitions( download_image_with_retry_partition, timeout=self.timeout, @@ -166,9 +166,11 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: ) # rename new columns to be conform the spec + logger.info("Renaming columns...") dataframe = dataframe.rename(columns={"data": "image_data", "width": "image_width", "height":"image_height"}) # Remove images that could not be fetched + logger.info("Dropping invalid rows...") dataframe = dataframe.dropna() logger.info(f"Length of the final dataframe: {len(dataframe)}") diff --git a/examples/pipelines/datacomp/components/dummy/Dockerfile b/examples/pipelines/datacomp/components/dummy/Dockerfile index 2378703ea..d40d75e77 100644 --- a/examples/pipelines/datacomp/components/dummy/Dockerfile +++ b/examples/pipelines/datacomp/components/dummy/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244 +ARG FONDANT_VERSION=be3e4b83dcde14f6c8a1b33078c99ead1d40655e RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/dummy/fondant_component.yaml b/examples/pipelines/datacomp/components/dummy/fondant_component.yaml index 346f3aa1b..ee39a4ac5 100644 --- a/examples/pipelines/datacomp/components/dummy/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/dummy/fondant_component.yaml @@ -1,6 +1,6 @@ name: Dummy component description: Component that downloads images based on URLs -image: ghcr.io/ml6team/dummy:cc418db0964ee202ebf2d4fd3aaee15604604244 +image: ghcr.io/ml6team/dummy:be3e4b83dcde14f6c8a1b33078c99ead1d40655e consumes: image: diff --git a/examples/pipelines/datacomp/components/dummy/src/main.py 
b/examples/pipelines/datacomp/components/dummy/src/main.py
index 5700c1936..929d04245 100644
--- a/examples/pipelines/datacomp/components/dummy/src/main.py
+++ b/examples/pipelines/datacomp/components/dummy/src/main.py
@@ -17,6 +17,8 @@ class DummyComponent(DaskTransformComponent):
     def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame:
 
         logger.info(f"Length of the dataframe: {len(dataframe)}")
+        print("Columns of the dataframe:", dataframe.columns)
+        print("Dtypes of the dataframe:", dataframe.dtypes)
 
         return dataframe
 
diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
index ab25454b6..5fed49cae 100644
--- a/examples/pipelines/datacomp/pipeline.py
+++ b/examples/pipelines/datacomp/pipeline.py
@@ -86,9 +86,9 @@
     node_pool_name="model-inference-pool",
     output_partition_size="disable",
 )
-# dummpy_op = ComponentOp(
-# component_dir="components/dummy",
-# )
+dummpy_op = ComponentOp(
+    component_dir="components/dummy",
+)
 
 
 
diff --git a/src/fondant/data_io.py b/src/fondant/data_io.py
index 9683e7e3d..25f92be41 100644
--- a/src/fondant/data_io.py
+++ b/src/fondant/data_io.py
@@ -205,6 +205,9 @@ def write_dataframe(self, dataframe: dd.DataFrame) -> None:
 
         logger.info("Creating write tasks...")
 
+        print("Dataframe columns:", dataframe.columns)
+        print("Dataframe dtypes:", dataframe.dtypes)
+
         # Turn index into an empty dataframe so we can write it
         index_df = dataframe.index.to_frame().drop(columns=["id"])
         write_index_task = self._write_subset(

From 2cee54b7804817bdc51facb9b9781233a4ea6ea7 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Sun, 6 Aug 2023 12:27:03 +0200
Subject: [PATCH 63/65] More improvements

---
 .../datacomp/components/detect_text_torch_gpu/Dockerfile     | 2 +-
 .../components/detect_text_torch_gpu/fondant_component.yaml  | 2 +-
 .../datacomp/components/detect_text_torch_gpu/src/main.py    | 5 ++++-
 .../pipelines/datacomp/components/download_images/Dockerfile | 2 +-
 .../components/download_images/fondant_component.yaml        | 2 +-
 5 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile
index 6c3c2a504..a2adc1c4f 100644
--- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile
+++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile
@@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt
 
 # Install Fondant
 # This is split from other requirements to leverage caching
-ARG FONDANT_VERSION=be3e4b83dcde14f6c8a1b33078c99ead1d40655e
+ARG FONDANT_VERSION=b40c7d9b579b0fc3531c871c82f39c1a395c2ee4
 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION}
 
 # Set the working directory to the component folder
diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
index 77a2590d2..20b642bfc 100644
--- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
+++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Detect text
 description: Component that detects text in images
-image: ghcr.io/ml6team/detect_text_torch_gpu:be3e4b83dcde14f6c8a1b33078c99ead1d40655e
+image: ghcr.io/ml6team/detect_text_torch_gpu:b40c7d9b579b0fc3531c871c82f39c1a395c2ee4
 
 consumes:
   image:
diff --git 
a/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py index ef04907d9..f3698efee 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/src/main.py @@ -128,7 +128,7 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: # for each column in the output meta = {column: dtype for column, dtype in zip(dataframe.columns, dataframe.dtypes)} meta["image_data"] = bytes - meta["image_boxes"] = np.dtype(object) + meta["image_boxes"] = np.dtype(np.int64) logger.info("Detecting texts..") dataframe = dataframe.map_partitions( @@ -137,6 +137,9 @@ def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: meta=meta, ) + # cast image_data to the right dtype + dataframe = dataframe.astype({'image_data': bytes, 'image_boxes': np.dtype(np.int64)}) + return dataframe diff --git a/examples/pipelines/datacomp/components/download_images/Dockerfile b/examples/pipelines/datacomp/components/download_images/Dockerfile index e747f39e9..5dfff914c 100644 --- a/examples/pipelines/datacomp/components/download_images/Dockerfile +++ b/examples/pipelines/datacomp/components/download_images/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 +ARG FONDANT_VERSION=b40c7d9b579b0fc3531c871c82f39c1a395c2ee4 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml index 6d700e80c..76504f1dc 100644 --- a/examples/pipelines/datacomp/components/download_images/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:fff8ca0c67f09a753fc08d8cfa13e73ab0b66890 +image: ghcr.io/ml6team/download_images:b40c7d9b579b0fc3531c871c82f39c1a395c2ee4 consumes: image: From 39c56436e20fb920a50c26a4d0753251993f3251 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 8 Aug 2023 10:04:24 +0200 Subject: [PATCH 64/65] Remove dummy op --- .../detect_text_torch_gpu/Dockerfile | 2 +- .../fondant_component.yaml | 2 +- .../datacomp/components/dummy/Dockerfile | 23 --------------- .../components/dummy/fondant_component.yaml | 9 ------ .../components/dummy/requirements.txt | 0 .../datacomp/components/dummy/src/main.py | 28 ------------------- examples/pipelines/datacomp/pipeline.py | 8 ------ 7 files changed, 2 insertions(+), 70 deletions(-) delete mode 100644 examples/pipelines/datacomp/components/dummy/Dockerfile delete mode 100644 examples/pipelines/datacomp/components/dummy/fondant_component.yaml delete mode 100644 examples/pipelines/datacomp/components/dummy/requirements.txt delete mode 100644 examples/pipelines/datacomp/components/dummy/src/main.py diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile index a2adc1c4f..52a84d19f 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile +++ 
b/examples/pipelines/datacomp/components/detect_text_torch_gpu/Dockerfile @@ -11,7 +11,7 @@ RUN pip3 install --no-cache-dir -r requirements.txt # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=b40c7d9b579b0fc3531c871c82f39c1a395c2ee4 +ARG FONDANT_VERSION=2cee54b7804817bdc51facb9b9781233a4ea6ea7 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml index 20b642bfc..2fb1d525c 100644 --- a/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text_torch_gpu/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text_torch_gpu:b40c7d9b579b0fc3531c871c82f39c1a395c2ee4 +image: ghcr.io/ml6team/detect_text_torch_gpu:2cee54b7804817bdc51facb9b9781233a4ea6ea7 consumes: image: diff --git a/examples/pipelines/datacomp/components/dummy/Dockerfile b/examples/pipelines/datacomp/components/dummy/Dockerfile deleted file mode 100644 index d40d75e77..000000000 --- a/examples/pipelines/datacomp/components/dummy/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -FROM --platform=linux/amd64 python:3.8-slim - -# System dependencies -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install git -y - -# Install requirements -COPY requirements.txt / -RUN pip3 install --no-cache-dir -r requirements.txt - -# Install Fondant -# This is split from other requirements to leverage caching -ARG FONDANT_VERSION=be3e4b83dcde14f6c8a1b33078c99ead1d40655e -RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} - -# Set the working directory to the component folder -WORKDIR /component/src - -# Copy over src-files -COPY src/ . - -ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/dummy/fondant_component.yaml b/examples/pipelines/datacomp/components/dummy/fondant_component.yaml deleted file mode 100644 index ee39a4ac5..000000000 --- a/examples/pipelines/datacomp/components/dummy/fondant_component.yaml +++ /dev/null @@ -1,9 +0,0 @@ -name: Dummy component -description: Component that downloads images based on URLs -image: ghcr.io/ml6team/dummy:be3e4b83dcde14f6c8a1b33078c99ead1d40655e - -consumes: - image: - fields: - data: - type: binary \ No newline at end of file diff --git a/examples/pipelines/datacomp/components/dummy/requirements.txt b/examples/pipelines/datacomp/components/dummy/requirements.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/pipelines/datacomp/components/dummy/src/main.py b/examples/pipelines/datacomp/components/dummy/src/main.py deleted file mode 100644 index 929d04245..000000000 --- a/examples/pipelines/datacomp/components/dummy/src/main.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -Dummy component for debugging. 
-""" -import logging - -import dask.dataframe as dd - -from fondant.component import DaskTransformComponent -from fondant.executor import DaskTransformExecutor - -logger = logging.getLogger(__name__) - - -class DummyComponent(DaskTransformComponent): - """Component that downloads images based on URLs.""" - - def transform(self, dataframe: dd.DataFrame) -> dd.DataFrame: - - logger.info(f"Length of the dataframe: {len(dataframe)}") - print("Columns of the dataframe:", dataframe.columns) - print("Dyptes of the dataframe:", dataframe.dtypes) - - return dataframe - - -if __name__ == "__main__": - executor = DaskTransformExecutor.from_args() - executor.execute(DummyComponent) \ No newline at end of file diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 5fed49cae..8ead7d553 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -76,9 +76,6 @@ node_pool_name="model-inference-pool", output_partition_size="disable", ) -# dummpy_op = ComponentOp( -# component_dir="components/dummy", -# ) detect_text_gpu_op = ComponentOp( component_dir="components/detect_text_torch_gpu", number_of_gpus=1, @@ -86,11 +83,6 @@ node_pool_name="model-inference-pool", output_partition_size="disable", ) -dummpy_op = ComponentOp( - component_dir="components/dummy", -) - - # add ops to pipeline From 8ce9b372b58875efc9c5b41aa5d82fb69b898d53 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 8 Aug 2023 10:30:31 +0200 Subject: [PATCH 65/65] Update dockerfile --- examples/pipelines/datacomp/components/detect_text/Dockerfile | 2 +- .../datacomp/components/detect_text/fondant_component.yaml | 2 +- examples/pipelines/datacomp/pipeline.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pipelines/datacomp/components/detect_text/Dockerfile b/examples/pipelines/datacomp/components/detect_text/Dockerfile index ad09d730e..093e48947 100644 --- a/examples/pipelines/datacomp/components/detect_text/Dockerfile +++ b/examples/pipelines/datacomp/components/detect_text/Dockerfile @@ -12,7 +12,7 @@ RUN pip3 install torch torchvision --index-url https://download.pytorch.org/whl/ # Install Fondant # This is split from other requirements to leverage caching -ARG FONDANT_VERSION=cc418db0964ee202ebf2d4fd3aaee15604604244 +ARG FONDANT_VERSION=39c56436e20fb920a50c26a4d0753251993f3251 RUN pip3 install fondant[aws,azure,gcp]@git+https://github.com/ml6team/fondant@${FONDANT_VERSION} # Set the working directory to the component folder diff --git a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml index 357feb55f..b7364d946 100644 --- a/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml +++ b/examples/pipelines/datacomp/components/detect_text/fondant_component.yaml @@ -1,6 +1,6 @@ name: Detect text description: Component that detects text in images -image: ghcr.io/ml6team/detect_text:cc418db0964ee202ebf2d4fd3aaee15604604244 +image: ghcr.io/ml6team/detect_text:39c56436e20fb920a50c26a4d0753251993f3251 consumes: image: diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py index 8ead7d553..1455da71d 100644 --- a/examples/pipelines/datacomp/pipeline.py +++ b/examples/pipelines/datacomp/pipeline.py @@ -88,7 +88,7 @@ # add ops to pipeline pipeline.add_op(load_from_hub_op) pipeline.add_op(download_images_op, dependencies=load_from_hub_op) -pipeline.add_op(detect_text_gpu_op, 
dependencies=download_images_op) +pipeline.add_op(detect_text_op, dependencies=download_images_op) # TODO add more ops client.compile_and_run(pipeline=pipeline) \ No newline at end of file