ml6team · NielsRogge · Aug 8, 2023 · Aug 8, 2023 · Aug 8, 2023 · Aug 8, 2023
diff --git a/components/load_from_hf_hub/fondant_component.yaml b/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:dev
+image: ghcr.io/ml6team/load_from_hf_hub:566a5b6fd3d422c98f7890a6e4101d8af10a42bf
 
 produces:
   dummy_variable:  #TODO: fill in here

diff --git a/components/load_from_hf_hub/src/main.py b/components/load_from_hf_hub/src/main.py
@@ -44,12 +44,25 @@ def load(self) -> dd.DataFrame:
                 )
 
         # 3) Rename columns
+        logger.info("Renaming columns...")
         dask_df = dask_df.rename(columns=self.column_name_mapping)
 
         # 4) Optional: only return specific amount of rows
-        if self.n_rows_to_load:
-            dask_df = dask_df.head(self.n_rows_to_load)
-            dask_df = dd.from_pandas(dask_df, npartitions=1)
+        if self.n_rows_to_load is not None:
+            partitions_length = 0 
+            for npartitions, partition in enumerate(dask_df.partitions):
+                if partitions_length >= self.n_rows_to_load:
+                    logger.info(f"Required number of partitions to load {self.n_rows_to_load} is {npartitions}")
+                    break 
+                partitions_length += len(partition)
+            dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions)
+            dask_df = dd.from_pandas(dask_df, npartitions=npartitions)
+
+        # Set a monotonically increasing index (1..N) by cumulatively summing a constant column of ones
+        logger.info("Setting the index...")
+        dask_df["id"] = 1
+        dask_df["id"] = dask_df.id.cumsum()
+        dask_df = dask_df.set_index("id", sort=True)
 
         return dask_df
 

diff --git a/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml b/examples/pipelines/datacomp/components/load_from_hf_hub/fondant_component.yaml
@@ -1,6 +1,6 @@
 name: Load from hub
 description: Component that loads a dataset from the hub
-image: ghcr.io/ml6team/load_from_hf_hub:dev
+image: ghcr.io/ml6team/load_from_hf_hub:566a5b6fd3d422c98f7890a6e4101d8af10a42bf
 
 produces:
   image:

diff --git a/examples/pipelines/datacomp/pipeline.py b/examples/pipelines/datacomp/pipeline.py
@@ -7,14 +7,13 @@
 
 from pipeline_configs import PipelineConfigs
 
-from fondant.compiler import DockerCompiler
 from fondant.pipeline import ComponentOp, Pipeline, Client
 
 logger = logging.getLogger(__name__)
 
 # Initialize pipeline and client
 pipeline = Pipeline(
-    pipeline_name="Datacomp filtering pipeline",
+    pipeline_name="datacomp-filtering",
     pipeline_description="A pipeline for filtering the Datacomp dataset",
     # base_path=PipelineConfigs.BASE_PATH,
     base_path="/Users/nielsrogge/Documents/fondant_artifacts_datacomp",
@@ -65,17 +64,7 @@
 
 # add ops to pipeline
 pipeline.add_op(load_from_hub_op)
-pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
-pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
-pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
+# pipeline.add_op(filter_image_resolution_op, dependencies=load_from_hub_op)
+# pipeline.add_op(filter_complexity_op, dependencies=filter_image_resolution_op)
+# pipeline.add_op(cluster_image_embeddings_op, dependencies=filter_complexity_op)
 # TODO add more ops
-
-# compile
-if __name__ == "__main__":
-    compiler = DockerCompiler()
-    # mount the gcloud credentials to the container
-    extra_volumes = [
-        "$HOME/.config/gcloud/application_default_credentials.json:/root/.config/gcloud/application_default_credentials.json:ro"
-    ]
-    compiler.compile(pipeline=pipeline, extra_volumes=extra_volumes)
-    logger.info("Run `docker compose up` to run the pipeline.")