
Modify arg default #512

Merged: 13 commits, Oct 14, 2023
4 changes: 2 additions & 2 deletions components/load_from_hf_hub/fondant_component.yaml
Original file line number Diff line number Diff line change
@@ -19,11 +19,11 @@ args:
description: Optional argument, a list containing the original image column names in case the
dataset on the hub contains them. Used to format the image from HF hub format to a byte string.
type: list
default: []
default: None
Member:

I think for a list, [] makes more sense than None.

Also in the other components.

Contributor Author:

I'm not sure I agree. Then by definition it's either:

  • a normal argument with [] as default, which is no longer optional
  • an optional argument with [] as default, which is not really common since Optional types don't mix well with mutable data types
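The pitfall behind "Optional types don't mix well with mutable data types" is Python's shared mutable default. A minimal hypothetical sketch (function names are illustrative, not from this PR):

```python
import typing as t

def append_bad(item: int, items: list = []) -> list:
    # The default list is created once at definition time and is shared
    # across every call that relies on the default
    items.append(item)
    return items

def append_good(item: int, items: t.Optional[list] = None) -> list:
    # None sentinel: a fresh list is created on each call
    if items is None:
        items = []
    items.append(item)
    return items

print(append_bad(1))   # [1]
print(append_bad(2))   # [1, 2]  <- state leaked from the previous call
print(append_good(1))  # [1]
print(append_good(2))  # [2]
```

This is why a Python signature with a `[]` default is usually avoided, while a `[]` default declared in a YAML spec has no such problem.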

Member:

  • a normal argument with [] as default and no longer optional

If a default is defined, it is optional. It's just not KFP's isOptional. I don't see any issue with this.

  • Optional argument with [] as default which is not really common since Optional types don't really mix well with mutable data types

In Python this is an issue indeed, however we don't need to define the default in Python. We can just define the default in the fondant_component.yaml:

fondant_component.yaml

    default: []

main.py

    image_column_names: list,

Contributor Author:

We're just trying to indicate the absence of a value, which is what None is better suited for. The empty list in this case won't be used for appending or to modify any behavior. Is there any added advantage compared to None?

It also seems like we're making an arbitrary choice about which data types get an "empty" default. Should we also include dictionaries?

Member:

The advantage is that you don't need to handle the None case in the code, and can always assume that the value is of the type defined in the argument.

I would indeed include dictionaries as well. I don't think that's arbitrary.

This is btw just about the component implementation. Fondant supports None for list and dict as well. I just don't think we need to use it 😛
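The two conventions under discussion can be contrasted in a hypothetical component snippet (function and column names are illustrative, not the actual component code):

```python
import typing as t

def write_with_optional(image_column_names: t.Optional[list]) -> list:
    # None-sentinel convention: the code must guard before iterating
    if image_column_names is not None:
        return [c.upper() for c in image_column_names]
    return []

def write_with_empty(image_column_names: list) -> list:
    # Empty-default convention (the [] default lives in
    # fondant_component.yaml, so no mutable Python default is needed):
    # the code can iterate unconditionally
    return [c.upper() for c in image_column_names]

print(write_with_optional(None))   # []
print(write_with_empty([]))        # []
print(write_with_empty(["img"]))   # ['IMG']
```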

n_rows_to_load:
description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
type: int
default: -1
default: None
index_column:
description: Column to set index to in the load component, if not specified a default globally unique index will be set
type: str
16 changes: 10 additions & 6 deletions components/load_from_hf_hub/src/main.py
@@ -1,5 +1,6 @@
"""This component loads a seed dataset from the hub."""
import logging
import typing as t

import dask
import dask.dataframe as dd
@@ -19,9 +20,9 @@ def __init__(self,
*_,
dataset_name: str,
column_name_mapping: dict,
image_column_names: list,
n_rows_to_load: int,
index_column: str,
image_column_names: t.Optional[list],
n_rows_to_load: t.Optional[int],
index_column: t.Optional[str],
) -> None:
"""
Args:
@@ -57,6 +58,9 @@ def load(self) -> dd.DataFrame:
(subset_field_name, subset_field_name)
columns.append(column_name)

if self.index_column is not None:
Contributor Author:

This is needed to keep the index column, which will be dropped later on (if it exists). We weren't picking it up before since it was not in the component spec or the remapping dict.

columns.append(self.index_column)

logger.debug(f"Columns to keep: {columns}")
dask_df = dd.read_parquet(f"hf://datasets/{self.dataset_name}", columns=columns)

@@ -72,7 +76,7 @@ def load(self) -> dd.DataFrame:
dask_df = dask_df.rename(columns=self.column_name_mapping)

# 4) Optional: only return specific amount of rows
if self.n_rows_to_load > 0:
if self.n_rows_to_load is not None:
partitions_length = 0
npartitions = 1
for npartitions, partition in enumerate(dask_df.partitions, start=1):
@@ -84,8 +88,8 @@ def load(self) -> dd.DataFrame:
dask_df = dask_df.head(self.n_rows_to_load, npartitions=npartitions)
dask_df = dd.from_pandas(dask_df, npartitions=npartitions)

# 5) Set the index
if self.index_column == "None":
# 4) Set the index
if self.index_column is None:
logger.info(
"Index column not specified, setting a globally unique index",
)
3 changes: 3 additions & 0 deletions components/load_from_parquet/src/main.py
@@ -54,6 +54,9 @@ def load(self) -> dd.DataFrame:
(subset_field_name, subset_field_name)
columns.append(column_name)

if self.index_column is not None:
columns.append(self.index_column)

logger.debug(f"Columns to keep: {columns}")
dask_df = dd.read_parquet(self.dataset_uri, columns=columns)

6 changes: 3 additions & 3 deletions components/write_to_hf_hub/src/main.py
@@ -39,8 +39,8 @@ def __init__(self,
hf_token: str,
username: str,
dataset_name: str,
image_column_names: list,
column_name_mapping: dict,
image_column_names: t.Optional[list],
column_name_mapping: t.Optional[dict],
):
"""
Args:
@@ -87,7 +87,7 @@ def write(
# Map image column to hf data format
feature_encoder = datasets.Image(decode=True)

if self.image_column_names:
if self.image_column_names is not None:
for image_column_name in self.image_column_names:
dataframe[image_column_name] = dataframe[image_column_name].map(
lambda x: convert_bytes_to_image(x, feature_encoder),
Original file line number Diff line number Diff line change
@@ -15,4 +15,4 @@ args:
n_records_to_download:
description: Number of records to download
type: int
default: -1
default: None
Original file line number Diff line number Diff line change
@@ -18,7 +18,7 @@ def __init__(
self,
*_,
common_crawl_indices: t.List[str],
n_records_to_download: int,
n_records_to_download: t.Optional[int] = None,
):
self.index_urls = [
self.build_index_url(index_name) for index_name in common_crawl_indices
@@ -38,7 +38,7 @@ def load(self) -> dd.DataFrame:
warc_urls.extend([line.decode() for line in extracted.split(b"\n")])

df = pd.DataFrame(warc_urls, columns=["warc_url"])
if self.n_records_to_download > 0:
if self.n_records_to_download is not None:
df = df.head(self.n_records_to_download)

return dd.from_pandas(df, npartitions=len(df) // 100)
Original file line number Diff line number Diff line change
@@ -12,4 +12,4 @@ args:
n_rows_to_load:
description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
type: int
default: -1
default: None
Original file line number Diff line number Diff line change
@@ -114,7 +114,7 @@ def load(self) -> dd.DataFrame:

pandas_df = pd.DataFrame(prompts, columns=["prompts_text"])

if self.n_rows_to_load > 0:
if self.n_rows_to_load:
pandas_df = pandas_df.head(self.n_rows_to_load)

df = dd.from_pandas(pandas_df, npartitions=1)
Original file line number Diff line number Diff line change
@@ -31,8 +31,8 @@ args:
image_column_names:
description: A list containing the image column names. Used to format to image to HF hub format
type: list
default: []
default: None
column_name_mapping:
description: Mapping of the consumed fondant column names to the written hub column names
type: dict
default: {}
default: None
Original file line number Diff line number Diff line change
@@ -50,11 +50,11 @@ args:
description: Optional argument, a list containing the original image column names in case the
dataset on the hub contains them. Used to format the image from HF hub format to a byte string.
type: list
default: []
default: None
n_rows_to_load:
description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
type: int
default: -1
default: None
index_column:
description: Column to set index to in the load component, if not specified a default globally unique index will be set
type: str
Original file line number Diff line number Diff line change
@@ -24,11 +24,11 @@ args:
description: Optional argument, a list containing the original image column names in case the
dataset on the hub contains them. Used to format the image from HF hub format to a byte string.
type: list
default: []
default: None
n_rows_to_load:
description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
type: int
default: -1
default: None
index_column:
description: Column to set index to in the load component, if not specified a default globally unique index will be set
type: str
Original file line number Diff line number Diff line change
@@ -26,8 +26,8 @@ args:
image_column_names:
description: A list containing the image column names. Used to format to image to HF hub format
type: list
default: []
default: None
column_name_mapping:
description: Mapping of the consumed fondant column names to the written hub column names
type: dict
default: {}
default: None
Original file line number Diff line number Diff line change
@@ -33,11 +33,11 @@ args:
description: A list containing the original hub image column names. Used to format the image
from HF hub format to a byte string
type: list
default: []
default: None
n_rows_to_load:
description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
type: int
default: -1
default: None
index_column:
description: Column to set index to in the load component, if not specified a default globally unique index will be set
type: str
11 changes: 6 additions & 5 deletions src/fondant/component_spec.py
@@ -34,7 +34,7 @@ class Argument:
name: str
description: str
type: str
default: t.Any = None
default: t.Optional[t.Any] = None
optional: t.Optional[bool] = False

@property
@@ -47,7 +47,8 @@ def python_type(self) -> t.Any:
"dict": json.loads,
"list": json.loads,
}
return lookup[self.type]
map_fn = lookup[self.type]
return lambda value: map_fn(value) if value != "None" else None # type: ignore
Member:

Is this needed? I see that we only use this to register the arguments. Setting the type as None feels strange.

Contributor Author:

This can be a bit confusing, so I'll recap:

  • In Kubeflow, optional types that default to None are defined as optional in the spec with no constant runtime value. See input3 here and here. They also should not be passed to the ComponentOp, so that's why we remove them here.
  • In Docker, all arguments are passed as strings. Above we define a map function per type that converts values back to their original type. If a value is the string "None", we convert it back to NoneType. We actually had this in the v1 implementation; not sure why it was removed.
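The recap above can be sketched as a self-contained version of the lookup (the `str`/`int`/`float`/`bool` entries here are assumptions for illustration; the diff only shows the `dict`/`list` entries using `json.loads`):

```python
import json
import typing as t

# Docker passes every argument as a string, so each declared type gets a
# parser that also maps the literal string "None" back to Python's None.
lookup: t.Dict[str, t.Callable[[str], t.Any]] = {
    "str": str,
    "int": int,
    "float": float,
    "bool": lambda v: v.lower() == "true",
    "dict": json.loads,
    "list": json.loads,
}

def python_type(type_name: str) -> t.Callable[[str], t.Any]:
    map_fn = lookup[type_name]
    return lambda value: map_fn(value) if value != "None" else None

print(python_type("list")('["a", "b"]'))  # ['a', 'b']
print(python_type("int")("None"))         # None
```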

Member:

Ok, I was a bit confused by the notation, but looking at it again, it's clear to me why this is needed. Thanks!


@property
def kubeflow_type(self) -> str:
@@ -230,7 +231,7 @@ def default_arguments(self) -> t.Dict[str, Argument]:
description="The number of rows to load per partition. \
Set to override the automatic partitioning",
type="int",
default=-1,
optional=True,
),
"cache": Argument(
name="cache",
@@ -286,9 +287,9 @@ def convert_arguments(fondant_component: ComponentSpec):
for arg in fondant_component.args.values():
arg_type_dict = {}

if arg.optional or arg.default is not None:
if arg.optional and arg.default is None:
arg_type_dict["isOptional"] = True
if arg.default is not None:
if arg.default is not None and arg.default != "None":
Member:

Do we need this? Shouldn't this PR make sure that "None" is no longer needed, but None can be used instead?

Contributor Author:

Good catch, it's indeed not needed here since this should be handled at compilation.

arg_type_dict["defaultValue"] = arg.default

args[arg.name] = {
31 changes: 16 additions & 15 deletions src/fondant/data_io.py
@@ -24,7 +24,7 @@ def __init__(
*,
manifest: Manifest,
component_spec: ComponentSpec,
input_partition_rows: int = -1,
input_partition_rows: t.Optional[int] = None,
):
super().__init__(manifest=manifest, component_spec=component_spec)
self.input_partition_rows = input_partition_rows
@@ -38,7 +38,20 @@ def partition_loaded_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame:
"""
n_workers: int = os.cpu_count() # type: ignore

if self.input_partition_rows > 1:
if self.input_partition_rows is None:
n_partitions = dataframe.npartitions
if n_partitions < n_workers: # type: ignore
logger.info(
f"The number of partitions of the input dataframe is {n_partitions}. The "
f"available number of workers is {n_workers}.",
)
dataframe = dataframe.repartition(npartitions=n_workers)
logger.info(
f"Repartitioning the data to {n_workers} partitions before processing"
f" to maximize worker usage",
)

elif self.input_partition_rows > 1:
Member:

Suggested change:
- elif self.input_partition_rows > 1:
+ elif self.input_partition_rows >= 1:
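The suggestion matters because with `> 1` a value of 1 would fall through to the error branch, even though the error message says any number larger than 0 is valid. A hypothetical, condensed sketch of the control flow (not the actual Fondant code):

```python
import typing as t

def choose_strategy(input_partition_rows: t.Optional[int]) -> str:
    if input_partition_rows is None:
        return "auto"   # let Fondant optimize the number of partitions
    if input_partition_rows >= 1:  # '>= 1' per the suggestion; '> 1' rejects 1
        return "fixed"  # user-specified number of rows per partition
    msg = (
        f"{input_partition_rows} is not a valid value for "
        f"'input_partition_rows'."
    )
    raise ValueError(msg)

print(choose_strategy(None))  # auto
print(choose_strategy(1))     # fixed
```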

# Only load the index column to trigger a faster compute of the rows
total_rows = len(dataframe.index)
# +1 to handle any remainder rows
@@ -56,23 +69,11 @@ def partition_loaded_dataframe(self, dataframe: dd.DataFrame) -> dd.DataFrame:
f" all available workers {n_partitions} out of {n_workers} are used.",
)

elif self.input_partition_rows == -1:
n_partitions = dataframe.npartitions
if n_partitions < n_workers: # type: ignore
logger.info(
f"The number of partitions of the input dataframe is {n_partitions}. The "
f"available number of workers is {n_workers}.",
)
dataframe = dataframe.repartition(npartitions=n_workers)
logger.info(
f"Repartitioning the data to {n_workers} partitions before processing"
f" to maximize worker usage",
)
else:
msg = (
f"{self.input_partition_rows} is not a valid value for the 'input_partition_rows' "
f"parameter. It should be a number larger than 0 to indicate the number of "
f"expected rows per partition, or '-1' to let Fondant optimize the number of "
f"expected rows per partition, or None to let Fondant optimize the number of "
f"partitions based on the number of available workers."
)
raise ValueError(
Original file line number Diff line number Diff line change
@@ -21,7 +21,6 @@ components:
isOptional: true
parameterType: STRING
input_partition_rows:
defaultValue: -1
isOptional: true
parameterType: NUMBER_INTEGER
metadata:
@@ -48,7 +47,6 @@ components:
isOptional: true
parameterType: STRING
input_partition_rows:
defaultValue: -1
isOptional: true
parameterType: NUMBER_INTEGER
metadata:
@@ -75,7 +73,6 @@ components:
isOptional: true
parameterType: STRING
input_partition_rows:
defaultValue: -1
isOptional: true
parameterType: NUMBER_INTEGER
metadata:
Original file line number Diff line number Diff line change
@@ -20,7 +20,6 @@ components:
isOptional: true
parameterType: STRING
input_partition_rows:
defaultValue: -1
isOptional: true
parameterType: NUMBER_INTEGER
metadata:
@@ -47,7 +46,6 @@ components:
isOptional: true
parameterType: STRING
input_partition_rows:
defaultValue: -1
isOptional: true
parameterType: NUMBER_INTEGER
metadata:
@@ -74,7 +72,6 @@ components:
isOptional: true
parameterType: STRING
input_partition_rows:
defaultValue: -1
isOptional: true
parameterType: NUMBER_INTEGER
metadata:
Original file line number Diff line number Diff line change
@@ -21,7 +21,6 @@ components:
isOptional: true
parameterType: STRING
input_partition_rows:
defaultValue: -1.0
isOptional: true
parameterType: NUMBER_INTEGER
metadata:
@@ -52,7 +51,6 @@ components:
isOptional: true
parameterType: STRING
input_partition_rows:
defaultValue: -1.0
isOptional: true
parameterType: NUMBER_INTEGER
metadata:
Original file line number Diff line number Diff line change
@@ -21,7 +21,6 @@ components:
isOptional: true
parameterType: STRING
input_partition_rows:
defaultValue: -1
isOptional: true
parameterType: NUMBER_INTEGER
metadata:
@@ -52,7 +51,6 @@ components:
isOptional: true
parameterType: STRING
input_partition_rows:
defaultValue: -1.0
isOptional: true
parameterType: NUMBER_INTEGER
metadata: