Load subset into dataframe #54

Merged · 7 commits · May 2, 2023

32 changes: 8 additions & 24 deletions fondant/dataset.py
@@ -9,7 +9,7 @@
 import dask.dataframe as dd

 from fondant.component_spec import FondantComponentSpec
-from fondant.manifest import Manifest, Index
+from fondant.manifest import Manifest
 from fondant.schema import Type, Field

 logger = logging.getLogger(__name__)
@@ -30,9 +30,7 @@ def __init__(self, manifest: Manifest):
             "__null_dask_index__": "int64",
         }

-    def _load_subset(
-        self, name: str, fields: t.List[str], index: t.Optional[Index] = None
-    ) -> dd.DataFrame:
+    def _load_subset(self, name: str, fields: t.List[str]) -> dd.DataFrame:
         # get subset from the manifest
         subset = self.manifest.subsets[name]
         # get remote path
@@ -49,13 +47,6 @@ def _load_subset(
             columns=fields,
         )

-        # filter on default index of manifest if no index is provided
-        if index is None:
-            index_df = self._load_index()
-            ids = index_df["id"].compute()
-            sources = index_df["source"].compute()
-            df = df[df["id"].isin(ids) & df["source"].isin(sources)]
-
         # add subset prefix to columns
         df = df.rename(
             columns={
@@ -70,26 +61,19 @@ def _load_index(self):
         index = self.manifest.index
         # get remote path
         remote_path = index.location
-
-        df = dd.read_parquet(remote_path)
-
-        if list(df.columns) != ["id", "source"]:
-            raise ValueError(
-                f"Index columns should be 'id' and 'source', found {df.columns}"
-            )
+        # load index from parquet, expecting id and source columns
+        df = dd.read_parquet(remote_path, columns=["id", "source"])

         return df

     def load_dataframe(self, spec: FondantComponentSpec) -> dd.DataFrame:
-        subset_dfs = []
+        # load index into dataframe
+        df = self._load_index()
         for name, subset in spec.input_subsets.items():
             fields = list(subset.fields.keys())
             subset_df = self._load_subset(name, fields)
-            subset_dfs.append(subset_df)
-
-        # return a single dataframe with column_names called subset_field
-        # TODO perhaps leverage dd.merge here instead
-        df = dd.concat(subset_dfs)
+            # left joins -> filter on index
+            df = dd.merge(df, subset_df, on=["id", "source"], how="left")

         logging.info("Columns of dataframe:", list(df.columns))

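To see how the new load_dataframe flow uses left joins as a filter on the index, here is a minimal, self-contained dask sketch (toy data and names are illustrative, not part of this PR):

import pandas as pd
import dask.dataframe as dd

# toy index: only the (id, source) pairs listed here should survive
index_df = dd.from_pandas(
    pd.DataFrame({"id": [1, 2, 3], "source": ["s1", "s1", "s1"]}), npartitions=1
)

# toy subset: the row with id=4 is not part of the index
subset_df = dd.from_pandas(
    pd.DataFrame(
        {
            "id": [1, 2, 3, 4],
            "source": ["s1", "s1", "s1", "s1"],
            "properties_Name": ["a", "b", "c", "d"],
        }
    ),
    npartitions=1,
)

# left join on the index, as in the new load_dataframe loop: rows absent from
# the index are dropped, and the subset's columns are appended
df = dd.merge(index_df, subset_df, on=["id", "source"], how="left")
print(df.compute())  # 3 rows with columns: id, source, properties_Name

Repeating the merge per subset keeps the index as the spine of the final dataframe, which is why the old dd.concat call and the per-subset index filtering in _load_subset could be dropped.
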
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -41,8 +41,9 @@ classifiers = [
 [tool.poetry.dependencies]
 python = "^3.8"
 jsonschema = "^4.17.3"
-dask = "^2022.2.0"
+dask = {extras = ["dataframe"], version = "^2023.4.1"}

+pyarrow = "^11.0.0"
Review comment (Member):
Nit: can we move this up from the optional to the required dependencies?

kfp = { version = "^1.8.19", optional = true }
kubernetes = { version = "^18.20.0", optional = true }
pandas = { version = "^1.3.5", optional = true }
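For context on the dependency change: the "dataframe" extra pulls in what dask.dataframe needs, and pyarrow is presumably the parquet engine behind the dd.read_parquet calls in fondant/dataset.py. A quick, illustrative import check (assuming the pinned versions are installed):

# sanity check that dask[dataframe] and pyarrow resolve after installing the project
import dask
import dask.dataframe as dd  # fails without the "dataframe" extra
import pyarrow

print(dask.__version__, pyarrow.__version__)
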
21 changes: 21 additions & 0 deletions tests/example_data/components/1.yaml
@@ -0,0 +1,21 @@
name: Test component 1
description: This is an example component
image: example_component:latest

input_subsets:
  properties:
    fields:
      Name:
        type: "utf8"
      HP:
        type: "int32"
  types:
    fields:
      Type 1:
        type: "utf8"
      Type 2:
        type: "utf8"
args:
  storage_args:
    description: Storage arguments
    type: str
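As a rough sketch of how this component spec is consumed (mirroring the load_dataframe loop in fondant/dataset.py; the path is the test fixture above):

from fondant.component_spec import FondantComponentSpec

spec = FondantComponentSpec.from_file("tests/example_data/components/1.yaml")

# iterate the declared input subsets and their fields, as load_dataframe does
for name, subset in spec.input_subsets.items():
    fields = list(subset.fields.keys())
    print(name, fields)  # e.g. properties -> ['Name', 'HP'], types -> ['Type 1', 'Type 2']
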
34 changes: 34 additions & 0 deletions tests/example_data/manifest.json
@@ -0,0 +1,34 @@
{
  "metadata": {
    "base_path": "tests/example_data/subsets",
    "run_id": "12345",
    "component_id": "67890"
  },
  "index": {
    "location": "/index"
  },
  "subsets": {
    "properties": {
      "location": "/properties",
      "fields": {
        "Name": {
          "type": "utf8"
        },
        "HP": {
          "type": "int32"
        }
      }
    },
    "types": {
      "location": "/types",
      "fields": {
        "Type 1": {
          "type": "utf8"
        },
        "Type 2": {
          "type": "utf8"
        }
      }
    }
  }
}
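A small sketch of how this manifest is read back by FondantDataset (the subset and index locations are assumed to resolve against metadata.base_path, which is what the test data layout suggests):

from fondant.manifest import Manifest

manifest = Manifest.from_file("tests/example_data/manifest.json")

# the index and each subset expose the parquet location used by dd.read_parquet above
print(manifest.index.location)
print(manifest.subsets["properties"].location)
print(manifest.subsets["types"].location)
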
39 changes: 39 additions & 0 deletions tests/example_data/raw/split.py
@@ -0,0 +1,39 @@
"""
This is a small script to split the raw data into different subsets to be used while testing.

The data is the 151 first pokemon and the following fields are available:

'id', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
'Sp. Atk', 'Sp. Def', 'Speed', 'source', 'Legendary'


"""
from pathlib import Path
import dask.dataframe as dd

data_path = Path(__file__).parent
output_path = Path(__file__).parent.parent / "subsets/"


def split_into_subsets():
# read in complete dataset
master_df = dd.read_parquet(path=data_path / "testset.parquet")

# create index subset
index_df = master_df[["id", "source"]]
index_df.set_index("id")
index_df.to_parquet(output_path / "index")

# create properties subset
properies_df = master_df[["id", "source", "Name", "HP"]]
properies_df.set_index("id")
properies_df.to_parquet(output_path / "properties")

# create types subset
types_df = master_df[["id", "source", "Type 1", "Type 2"]]
types_df.set_index("id")
types_df.to_parquet(output_path / "types")


if __name__ == "__main__":
split_into_subsets()
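A quick way to sanity-check the generated subsets (illustrative; paths follow the layout written by this script):

import dask.dataframe as dd

# every subset keeps "id" and "source", so load_dataframe can merge it onto the index
types_df = dd.read_parquet("tests/example_data/subsets/types")
print(list(types_df.columns))  # expect ["id", "source", "Type 1", "Type 2"]
print(len(types_df))           # expect 151 rows
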
Binary file added tests/example_data/raw/testset.parquet
Binary file not shown.
Binary file added tests/example_data/subsets/index/part.0.parquet
Binary file not shown.
Binary file added tests/example_data/subsets/properties/part.0.parquet
Binary file not shown.
Binary file added tests/example_data/subsets/types/part.0.parquet
Binary file not shown.
40 changes: 40 additions & 0 deletions tests/test_dataset.py
@@ -0,0 +1,40 @@
import json
import pytest
import dask.dataframe as dd
from pathlib import Path

from fondant.manifest import Manifest
from fondant.dataset import FondantDataset
from fondant.component_spec import FondantComponentSpec

manifest_path = Path(__file__).parent / "example_data/manifest.json"
component_spec_path = Path(__file__).parent / "example_data/components/1.yaml"


@pytest.fixture
def manifest():
    return Manifest.from_file(manifest_path)


@pytest.fixture
def component_spec():
    return FondantComponentSpec.from_file(component_spec_path)


def test_load_index(manifest):
    fds = FondantDataset(manifest)
    assert len(fds._load_index()) == 151


def test_merge_subsets(manifest, component_spec):
    fds = FondantDataset(manifest=manifest)
    df = fds.load_dataframe(spec=component_spec)
    assert len(df) == 151
    assert list(df.columns) == [
        "id",
        "source",
        "properties_Name",
        "properties_HP",
        "types_Type 1",
        "types_Type 2",
    ]